diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 178db42a609a..b25f3b21e8eb 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -8,6 +8,8 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0" elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" fi @@ -15,6 +17,8 @@ fi # Compress the fatbin with -compress-mode=size for CUDA 13 if [[ "$DESIRED_CUDA" == *"13"* ]]; then export TORCH_NVCC_FLAGS="-compress-mode=size" + # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801 + export BUILD_BUNDLE_PTXAS=1 fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -31,8 +35,7 @@ pip install -r /pytorch/requirements.txt pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 @@ -46,6 +49,5 @@ else export USE_NVIDIA_PYPI_LIBS=1 fi - #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files - USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda + python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index 1b6429fa8c06..a99e5f8f6565 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]: return check_output(["ls", "-1", path]).decode().split("\n") -def build_ArmComputeLibrary() -> None: - """ - Using ArmComputeLibrary for aarch64 PyTorch - """ - print("Building Arm Compute Library") - acl_build_flags = [ - "debug=0", - "neon=1", - "opencl=0", - "os=linux", - "openmp=1", - "cppthreads=0", - "arch=armv8a", - "multi_isa=1", - "fixed_format_kernels=1", - "build=native", - ] - acl_install_dir = "/acl" - acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") - if os.path.isdir(acl_install_dir): - shutil.rmtree(acl_install_dir) - if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): - check_call( - [ - "git", - "clone", - "https://github.com/ARM-software/ComputeLibrary.git", - "-b", - "v25.02", - "--depth", - "1", - "--shallow-submodules", - ] - ) - - check_call( - ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, - cwd=acl_checkout_dir, - ) - for d in ["arm_compute", "include", "utils", "support", "src", "build"]: - shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") - - def replace_tag(filename) -> None: with open(filename) as f: lines = f.readlines() @@ -317,7 +274,7 @@ def parse_arguments(): ).decode() print("Building PyTorch wheel") - build_vars = 
"CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " + build_vars = "" # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) if enable_cuda: build_vars += "MAX_JOBS=5 " @@ -356,23 +313,17 @@ def parse_arguments(): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 " if enable_mkldnn: - build_ArmComputeLibrary() print("build pytorch with mkldnn+acl backend") - build_vars += ( - "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " - "ACL_ROOT_DIR=/acl " - "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " - "ACL_INCLUDE_DIR=/acl/build " - "ACL_LIBRARY=/acl/build " - ) + build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " + build_vars += "ACL_ROOT_DIR=/acl " if enable_cuda: build_vars += "BLAS=NVPL " else: - build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS " + build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS " else: print("build pytorch without mkldnn backend") - os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel") + os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation") if enable_cuda: print("Updating Cuda Dependency") filename = os.listdir("/pytorch/dist/") diff --git a/.ci/aarch64_linux/build_aarch64_wheel.py b/.ci/aarch64_linux/build_aarch64_wheel.py index 7a4715d33006..a157ec57b574 100755 --- a/.ci/aarch64_linux/build_aarch64_wheel.py +++ b/.ci/aarch64_linux/build_aarch64_wheel.py @@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): try: with socket.create_connection((addr, port), timeout=timeout): return - except (ConnectionRefusedError, socket.timeout): # noqa: PERF203 + except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) @@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: ) -def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: - print("Building OpenBLAS") - host.run_cmd( - f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}" - ) - make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" - host.run_cmd( - f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS" - ) - - -def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: - print("Building Arm Compute Library") - acl_build_flags = " ".join( - [ - "debug=0", - "neon=1", - "opencl=0", - "os=linux", - "openmp=1", - "cppthreads=0", - "arch=armv8a", - "multi_isa=1", - "fixed_format_kernels=1", - "build=native", - ] - ) - host.run_cmd( - f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}" - ) - - host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") - - def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: host.run_cmd("pip3 install auditwheel") host.run_cmd( @@ -442,7 +408,7 @@ def build_torchvision( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel") + host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") vision_wheel_name = host.list_dir("vision/dist")[0] embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) @@ -497,7 +463,7 @@ def build_torchdata( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd data && {build_vars} python3 
setup.py bdist_wheel") + host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("data/dist")[0] embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) @@ -553,7 +519,7 @@ def build_torchtext( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" - host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel") + host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("text/dist")[0] embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) @@ -614,7 +580,7 @@ def build_torchaudio( host.run_cmd( f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ && ./packaging/ffmpeg/build.sh \ - && {build_vars} python3 setup.py bdist_wheel" + && {build_vars} python3 -m build --wheel --no-isolation" ) wheel_name = host.list_dir("audio/dist")[0] @@ -700,7 +666,6 @@ def start_build( configure_system( host, compiler=compiler, use_conda=use_conda, python_version=python_version ) - build_OpenBLAS(host, git_clone_flags) if host.using_docker(): print("Move libgfortant.a into a standard location") @@ -723,10 +688,12 @@ def start_build( f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" ) + host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh") + print("Building PyTorch wheel") build_opts = "" if pytorch_build_number is not None: - build_opts += f" --build-number {pytorch_build_number}" + build_opts += f" -C--build-option=--build-number={pytorch_build_number}" # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == "nightly": @@ -743,15 +710,18 @@ def start_build( if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" if enable_mkldnn: - build_ArmComputeLibrary(host, git_clone_flags) + host.run_cmd("pytorch/.ci/docker/common/install_acl.sh") print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" + build_vars += " BLAS=OpenBLAS" + build_vars += " OpenBLAS_HOME=/opt/OpenBLAS" + build_vars += " ACL_ROOT_DIR=/acl" host.run_cmd( - f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}" + f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Repair the wheel") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] - ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib" + ld_library_path = "/acl/build:$HOME/pytorch/build/lib" host.run_cmd( f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" ) @@ -763,7 +733,7 @@ def start_build( else: print("build pytorch without mkldnn backend") host.run_cmd( - f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}" + f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Deleting build folder") @@ -907,7 +877,7 @@ def terminate_instances(instance_type: str) -> None: def parse_arguments(): from argparse import ArgumentParser - parser = ArgumentParser("Builid and test AARCH64 wheels using EC2") + parser = ArgumentParser("Build and test AARCH64 wheels using EC2") parser.add_argument("--key-name", type=str) parser.add_argument("--debug", action="store_true") parser.add_argument("--build-only", action="store_true") @@ -1004,7 +974,7 @@ def parse_arguments(): install_condaforge_python(host, 
args.python_version) sys.exit(0) - python_version = args.python_version if args.python_version is not None else "3.9" + python_version = args.python_version if args.python_version is not None else "3.10" if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) diff --git a/.ci/docker/almalinux/Dockerfile b/.ci/docker/almalinux/Dockerfile index 481d21b96cfe..ce7803cf9acd 100644 --- a/.ci/docker/almalinux/Dockerfile +++ b/.ci/docker/almalinux/Dockerfile @@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0 ENV DESIRED_CUDA=13.0 FROM ${ROCM_IMAGE} as rocm -ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +ARG PYTORCH_ROCM_ARCH +ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ADD ./common/install_mkl.sh install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh ENV MKLROOT /opt/intel diff --git a/.ci/docker/almalinux/build.sh b/.ci/docker/almalinux/build.sh index ec15c13e439b..ad234ce1ffb9 100755 --- a/.ci/docker/almalinux/build.sh +++ b/.ci/docker/almalinux/build.sh @@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in ;; rocm*) BASE_TARGET=rocm + PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$ROCM_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi + EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" ;; *) echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index be85fdcb542d..a23c85bc60a5 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -84,8 +84,8 @@ fi _UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 _UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then - _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 - _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d + _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e + _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77 fi tag=$(echo $image | awk -F':' '{print $2}') @@ -113,6 +113,7 @@ case "$tag" in UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes + INSTALL_MINGW=yes ;; pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) CUDA_VERSION=13.0.0 @@ -175,28 +176,17 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=6.4 + ROCM_VERSION=7.0 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} + PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100" if [[ $tag =~ "benchmarks" ]]; then INDUCTOR_BENCHMARKS=yes fi ;; - pytorch-linux-noble-rocm-alpha-py3) - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=11 - VISION=yes - ROCM_VERSION=7.0 - NINJA_VERSION=1.9.0 - TRITON=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" - ;; pytorch-linux-jammy-xpu-n-1-py3) ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 @@ -262,13 +252,10 @@ case "$tag" in TRITON_CPU=yes ;; pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. 
The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) @@ -358,7 +345,7 @@ docker build \ --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ --build-arg "KATEX=${KATEX:-}" \ --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ - --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \ + --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \ --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ @@ -375,6 +362,7 @@ docker build \ --build-arg "OPENBLAS=${OPENBLAS:-}" \ --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ + --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ @@ -455,12 +443,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then echo "expecting triton to not be installed, but it is" exit 1 fi - -# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if -# they support 4.0.0 yet, so exclude them from this check. -CMAKE_VERSION=$(drun cmake --version) -if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then - echo "CMake version is not 4.0.0:" - drun cmake --version - exit 1 -fi diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 0e527f468229..f2e2d655a6cf 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -56392aa978594cc155fa8af48cd949f5b5f1823a +deb42f2a8e48f5032b4a98ee781a15fa87a157cf diff --git a/.ci/docker/ci_commit_pins/huggingface-requirements.txt b/.ci/docker/ci_commit_pins/huggingface-requirements.txt index 66e5dbdfb1bb..f4f3830136eb 100644 --- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt +++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt @@ -1,2 +1,2 @@ -transformers==4.54.0 +transformers==4.56.0 soxr==0.5.0 diff --git a/.ci/docker/ci_commit_pins/nccl-cu12.txt b/.ci/docker/ci_commit_pins/nccl-cu12.txt index d099a6b91b76..77a73992346c 100644 --- a/.ci/docker/ci_commit_pins/nccl-cu12.txt +++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt @@ -1 +1 @@ -v2.27.5-1 +v2.27.5-1 \ No newline at end of file diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index e543da3aa161..10f1207e60e6 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -5ae38bdb0dc066c5823e34dc9797afb9de42c866 +7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh old mode 100644 new mode 100755 index bf41a03b2806..0b865e5bc6f8 --- a/.ci/docker/common/install_acl.sh +++ b/.ci/docker/common/install_acl.sh @@ -1,16 +1,27 @@ -set -euo pipefail +#!/bin/bash +# Script used only in CD pipeline -readonly version=v25.02 -readonly src_host=https://github.com/ARM-software -readonly src_repo=ComputeLibrary +set -eux -# Clone ACL -[[ ! 
-d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git -cd ${src_repo} +ACL_VERSION=${ACL_VERSION:-"v25.02"} +ACL_INSTALL_DIR="/acl" -git checkout $version +# Clone ACL +git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules +ACL_CHECKOUT_DIR="ComputeLibrary" # Build with scons +pushd $ACL_CHECKOUT_DIR scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ os=linux arch=armv8a build=native multi_isa=1 \ fixed_format_kernels=1 openmp=1 cppthreads=0 +popd + +# Install ACL +sudo mkdir -p ${ACL_INSTALL_DIR} +for d in arm_compute include utils support src build +do + sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d} +done + +rm -rf $ACL_CHECKOUT_DIR \ No newline at end of file diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index 692edd0b898f..c873c930097b 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -83,10 +83,6 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 - if [ "$py_suffix" == "3.14.0" ]; then - py_suffix="3.14.0rc2" - fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index becd2264e395..fb168acd4feb 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -42,22 +42,27 @@ install_pip_dependencies() { # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current # numba and scipy version used in PyTorch CI conda_run pip uninstall -y numba scipy + # Yaspin is needed for running CI test (get_benchmark_analysis_data.py) + pip_install yaspin==3.1.0 popd } setup_executorch() { - pushd executorch - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON" as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true - popd } -clone_executorch -install_buck2 -install_conda_dependencies -install_pip_dependencies -setup_executorch +if [ $# -eq 0 ]; then + clone_executorch + install_buck2 + install_conda_dependencies + install_pip_dependencies + pushd executorch + setup_executorch + popd +else + "$@" +fi diff --git a/.ci/docker/common/install_mingw.sh b/.ci/docker/common/install_mingw.sh new file mode 100644 index 000000000000..6232a0d0245c --- /dev/null +++ b/.ci/docker/common/install_mingw.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +# Install MinGW-w64 for Windows cross-compilation +apt-get update +apt-get install -y g++-mingw-w64-x86-64-posix + +echo "MinGW-w64 installed successfully" +x86_64-w64-mingw32-g++ --version diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index 9f23feb5adfa..b0615b8a84c1 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -19,8 +19,8 @@ pip_install \ transformers==4.36.2 pip_install coloredlogs packaging -pip_install onnxruntime==1.22.1 -pip_install onnxscript==0.4.0 +pip_install onnxruntime==1.23.0 +pip_install onnxscript==0.5.4 # Cache the transformers model to be used later by ONNX tests. 
We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ diff --git a/.ci/docker/common/install_openblas.sh b/.ci/docker/common/install_openblas.sh old mode 100644 new mode 100755 index 3c795acf2220..2f386c6bd523 --- a/.ci/docker/common/install_openblas.sh +++ b/.ci/docker/common/install_openblas.sh @@ -3,8 +3,10 @@ set -ex -cd / -git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules +OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"} + +# Clone OpenBLAS +git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules OPENBLAS_CHECKOUT_DIR="OpenBLAS" OPENBLAS_BUILD_FLAGS=" @@ -17,5 +19,7 @@ CFLAGS=-O3 BUILD_BFLOAT16=1 " -make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR} -make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR} +make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR +sudo make install -C $OPENBLAS_CHECKOUT_DIR + +rm -rf $OPENBLAS_CHECKOUT_DIR \ No newline at end of file diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index a156670cb815..7878311c15b0 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -42,12 +42,6 @@ EOF rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}" amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu" - # Special case for ROCM_VERSION == 7.0 - if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then - rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2" - amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu" - fi - # Add amdgpu repository UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index a8d8ba00b35b..9bf45e6f1b0a 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # Version 2.7.2 + ROCm related updates - MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 + # https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f48140952c3a..1b68e3c24783 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" # Triton needs at least gcc-9 to build apt-get install -y g++-9 - CXX=g++-9 conda_run python setup.py bdist_wheel + CXX=g++-9 conda_run python -m build --wheel --no-isolation elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then # Triton needs which surprisingly is not available with clang-9 toolchain add-apt-repository -y ppa:ubuntu-toolchain-r/test apt-get install -y g++-9 - CXX=g++-9 conda_run python setup.py bdist_wheel + CXX=g++-9 conda_run python -m build --wheel --no-isolation else - conda_run python setup.py bdist_wheel + conda_run python -m build --wheel --no-isolation fi # Copy the wheel to /opt for multi stage docker builds diff --git a/.ci/docker/common/patch_libstdc.sh 
b/.ci/docker/common/patch_libstdc.sh new file mode 100755 index 000000000000..7e3a00d0dad8 --- /dev/null +++ b/.ci/docker/common/patch_libstdc.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -xe +# Script used in Linux x86 and aarch64 CD pipeline + +# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols. +# see: https://github.com/pytorch/pytorch/issues/133437 +LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a) +nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt +objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED diff --git a/.ci/docker/libtorch/build.sh b/.ci/docker/libtorch/build.sh index 7caedf1f44d4..c40896cb5499 100755 --- a/.ci/docker/libtorch/build.sh +++ b/.ci/docker/libtorch/build.sh @@ -39,13 +39,21 @@ case ${DOCKER_TAG_PREFIX} in DOCKER_GPU_BUILD_ARG="" ;; rocm*) - # we want the patch version of 6.4 instead - if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" fi + # we want the patch version of 6.4 instead + if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" + fi BASE_TARGET=rocm GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" ;; *) diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index 5d4d8dba690d..4803cb778c90 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; - +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index da7ab4d3fd15..768db0992936 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -62,6 +62,13 @@ ARG OPENBLAS_VERSION ADD ./common/install_openblas.sh install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh +# Install Arm Compute Library +FROM base as arm_compute +# use python3.9 to install scons +RUN python3.9 -m pip install scons==4.7.0 +RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin +COPY ./common/install_acl.sh install_acl.sh +RUN bash ./install_acl.sh && rm install_acl.sh FROM base as final # remove unnecessary python versions @@ -70,4 +77,7 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ -ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +COPY --from=arm_compute /acl /acl +ENV 
LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 369706055737..347a01ee4ede 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -86,6 +86,15 @@ FROM base as nvpl ADD ./common/install_nvpl.sh install_nvpl.sh RUN bash ./install_nvpl.sh && rm install_nvpl.sh +# Install Arm Compute Library +FROM base as arm_compute +# use python3.9 to install scons +RUN python3.9 -m pip install scons==4.7.0 +RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin +COPY ./common/install_acl.sh install_acl.sh +RUN bash ./install_acl.sh && rm install_acl.sh +FROM base as final + FROM final as cuda_final ARG BASE_CUDA_VERSION RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} @@ -93,5 +102,9 @@ COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BAS COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION} COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ +COPY --from=arm_compute /acl /acl RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cxx11-abi b/.ci/docker/manywheel/Dockerfile_cxx11-abi deleted file mode 100644 index ed33cc61df09..000000000000 --- a/.ci/docker/manywheel/Dockerfile_cxx11-abi +++ /dev/null @@ -1,71 +0,0 @@ -FROM centos:8 as base - -ENV LC_ALL en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US.UTF-8 -ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - -# change to a valid repo -RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo -# enable to install ninja-build -RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo - -RUN yum -y update -RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo -RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++ - - -FROM base as openssl -ADD ./common/install_openssl.sh install_openssl.sh -RUN bash ./install_openssl.sh && rm install_openssl.sh - -# Install python -FROM base as python -RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel -ADD common/install_cpython.sh install_cpython.sh -RUN bash ./install_cpython.sh && rm install_cpython.sh - -FROM base as conda -ADD ./common/install_conda_docker.sh install_conda.sh -RUN bash ./install_conda.sh && rm install_conda.sh -RUN /opt/conda/bin/conda install -y cmake - -FROM base as intel -# Install MKL -COPY --from=python /opt/python /opt/python -COPY --from=python /opt/_internal /opt/_internal -COPY --from=conda /opt/conda /opt/conda -ENV PATH=/opt/conda/bin:$PATH -ADD ./common/install_mkl.sh install_mkl.sh -RUN bash ./install_mkl.sh && rm install_mkl.sh - -FROM base as patchelf -ADD ./common/install_patchelf.sh install_patchelf.sh -RUN bash ./install_patchelf.sh && rm install_patchelf.sh -RUN cp $(which patchelf) /patchelf - -FROM base as jni -ADD ./common/install_jni.sh install_jni.sh -ADD 
./java/jni.h jni.h -RUN bash ./install_jni.sh && rm install_jni.sh - -FROM base as libpng -ADD ./common/install_libpng.sh install_libpng.sh -RUN bash ./install_libpng.sh && rm install_libpng.sh - -FROM base as final -COPY --from=openssl /opt/openssl /opt/openssl -COPY --from=python /opt/python /opt/python -COPY --from=python /opt/_internal /opt/_internal -COPY --from=intel /opt/intel /opt/intel -COPY --from=conda /opt/conda /opt/conda -COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf -COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h -COPY --from=libpng /usr/local/bin/png* /usr/local/bin/ -COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/ -COPY --from=libpng /usr/local/include/png* /usr/local/include/ -COPY --from=libpng /usr/local/include/libpng* /usr/local/include/ -COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/ -COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig - -RUN yum install -y ninja-build diff --git a/.ci/docker/manywheel/Dockerfile_s390x b/.ci/docker/manywheel/Dockerfile_s390x index 46ec7f77ae8b..1cf83acb1c73 100644 --- a/.ci/docker/manywheel/Dockerfile_s390x +++ b/.ci/docker/manywheel/Dockerfile_s390x @@ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio # cmake-3.28.0 from pip for onnxruntime RUN python3 -mpip install cmake==3.28.0 +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh + # build onnxruntime 1.21.0 from sources. # it is not possible to build it from sources using pip, # so just build it from upstream repository. diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 5dee4325857f..b4b505997303 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -28,6 +28,7 @@ fi MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} OPENBLAS_VERSION=${OPENBLAS_VERSION:-} +ACL_VERSION=${ACL_VERSION:-} case ${image} in manylinux2_28-builder:cpu) @@ -41,13 +42,6 @@ case ${image} in GPU_IMAGE=arm64v8/almalinux:8 DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" MANY_LINUX_VERSION="2_28_aarch64" - OPENBLAS_VERSION="v0.3.30" - ;; - manylinuxcxx11-abi-builder:cpu-cxx11-abi) - TARGET=final - GPU_IMAGE="" - DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" - MANY_LINUX_VERSION="cxx11-abi" ;; manylinuxs390x-builder:cpu-s390x) TARGET=final @@ -81,15 +75,23 @@ case ${image} in DOCKERFILE_SUFFIX="_cuda_aarch64" ;; manylinux2_28-builder:rocm*) - # we want the patch version of 6.4 instead - if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then + # we want the patch version of 7.0 instead + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2" fi + # we want the patch version of 6.4 instead + if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then + GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4" + fi TARGET=rocm_final MANY_LINUX_VERSION="2_28" DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + # add gfx950, gfx115x conditionally starting in ROCm 7.0 + if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151" + fi DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" ;; manylinux2_28-builder:xpu) 
@@ -121,7 +123,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') DOCKER_BUILDKIT=1 docker build \ ${DOCKER_GPU_BUILD_ARG} \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ - --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ + --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \ + --build-arg "ACL_VERSION=${ACL_VERSION:-}" \ --target "${TARGET}" \ -t "${tmp_tag}" \ $@ \ diff --git a/.ci/docker/manywheel/build_scripts/ssl-check.py b/.ci/docker/manywheel/build_scripts/ssl-check.py index 0fd7eb363144..c4df0eacbb7f 100644 --- a/.ci/docker/manywheel/build_scripts/ssl-check.py +++ b/.ci/docker/manywheel/build_scripts/ssl-check.py @@ -10,11 +10,6 @@ print("Testing SSL certificate checking for Python:", sys.version) -if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4): - print("This version never checks SSL certs; skipping tests") - sys.exit(0) - - EXC = OSError print(f"Connecting to {GOOD_SSL} should work") diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index 4e08c0d6711e..04dc2b98eb66 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -10,6 +10,11 @@ boto3==1.35.42 #Pinned versions: 1.19.12, 1.16.34 #test that import: +build==1.3.0 +#Description: A simple, correct Python build frontend. +#Pinned versions: 1.3.0 +#test that import: + click #Description: Command Line Interface Creation Kit #Pinned versions: @@ -47,10 +52,10 @@ flatbuffers==24.12.23 #Pinned versions: 24.12.23 #test that import: -hypothesis==5.35.1 +hypothesis==6.56.4 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 #Description: advanced library for generating parametrized tests -#Pinned versions: 5.35.1 +#Pinned versions: 6.56.4 #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py junitparser==2.1.1 @@ -93,8 +98,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system == "Linux" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -105,20 +111,17 @@ networkx==2.8.8 #Pinned versions: 2.8.8 #test that import: functorch -ninja==1.11.1.3 +ninja==1.11.1.4 #Description: build system. Used in some tests. 
Used in build to generate build #time tracing information -#Pinned versions: 1.11.1.3 +#Pinned versions: 1.11.1.4 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py -numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x" numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #Description: Just-In-Time Compiler for Numerical Functions -#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 +#Pinned versions: 0.55.2, 0.60.0 #test that import: test_numba_integration.py -#For numba issue see https://github.com/pytorch/pytorch/issues/51511 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073 #numpy @@ -133,7 +136,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" +numpy==1.22.4; python_version == "3.10" numpy==1.26.2; python_version == "3.11" or python_version == "3.12" numpy==2.1.2; python_version >= "3.13" @@ -165,12 +168,12 @@ optree==0.13.0 pillow==11.0.0 #Description: Python Imaging Library fork -#Pinned versions: 10.3.0 +#Pinned versions: 11.0.0 #test that import: -protobuf==5.29.4 +protobuf==5.29.5 #Description: Google's data interchange format -#Pinned versions: 5.29.4 +#Pinned versions: 5.29.5 #test that import: test_tensorboard.py, test/onnx/* psutil @@ -213,7 +216,7 @@ pytest-subtests==0.13.1 #Pinned versions: #test that import: -xdoctest==1.1.0 +xdoctest==1.3.0 #Description: runs doctests in pytest #Pinned versions: 1.1.0 #test that import: @@ -238,10 +241,9 @@ pygments==2.15.0 #Pinned versions: 14.1.0 #test that import: -scikit-image==0.19.3 ; python_version < "3.10" -scikit-image==0.22.0 ; python_version >= "3.10" +scikit-image==0.22.0 #Description: image processing routines -#Pinned versions: +#Pinned versions: 0.22.0 #test that import: test_nn.py #scikit-learn @@ -264,7 +266,7 @@ scipy==1.14.1 ; python_version >= "3.12" #test that import: # needed by torchgen utils -typing-extensions>=4.10.0 +typing-extensions==4.12.2 #Description: type hints for python #Pinned versions: #test that import: @@ -325,8 +327,6 @@ pywavelets==1.7.0 ; python_version >= "3.12" lxml==5.3.0 #Description: This is a requirement of unittest-xml-reporting -# Python-3.9 binaries - PyGithub==2.3.0 sympy==1.13.3 @@ -339,7 +339,7 @@ onnx==1.18.0 #Pinned versions: #test that import: -onnxscript==0.4.0 +onnxscript==0.5.3 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -359,9 +359,10 @@ pwlf==2.2.1 #test that import: test_sac_estimator.py # To build PyTorch itself -pyyaml +pyyaml==6.0.2 pyzstd -setuptools>=70.1.0 +setuptools==78.1.1 +packaging==23.1 six scons==4.5.2 ; platform_machine == "aarch64" @@ -376,13 +377,16 @@ dataclasses_json==0.6.7 #Pinned versions: 0.6.7 #test that import: -cmake==4.0.0 +cmake==3.31.6 #Description: required for building tlparse==0.4.0 #Description: required for log parsing -cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" +filelock==3.18.0 +#Description: required for inductor testing + +cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin" #Description: required for testing 
CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits. #test that import: test_cuda.py diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index efe6fb4c949b..6e623b4c5694 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,8 +1,15 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2 +standard-imghdr==3.13.0; python_version >= "3.13" +#Description: This is needed by Sphinx, so it needs to be added here. +# The reasons are as follows: +# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr); +# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13. +# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency. + +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably # something related to Docker setup. We can investigate this later. 
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 1edc8c60c2f0..3f22a1276921 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -103,6 +103,11 @@ COPY ci_commit_pins/torchbench.txt torchbench.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt +ARG INSTALL_MINGW +COPY ./common/install_mingw.sh install_mingw.sh +RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi +RUN rm install_mingw.sh + ARG TRITON ARG TRITON_CPU diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py index 9833caca956c..c4d6f8a0b6f5 100644 --- a/.ci/lumen_cli/cli/lib/common/git_helper.py +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules logger.info("Successfully cloned %s", target) return r, commit - except GitCommandError as e: - logger.error("Git operation failed: %s", e) + except GitCommandError: + logger.exception("Git operation failed") raise diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 98cfc807e284..8c106214ea9e 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -41,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -68,15 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { @@ -147,7 +143,7 @@ def sample_vllm_test_library(): "pytest -v -s compile/test_decorator.py", ], }, - "vllm_languagde_model_test_extended_generation_28_failure_test": { + "vllm_language_model_test_extended_generation_28_failure_test": { "title": "Language Models Test (Extended Generation) 2.8 release failure", "id": "vllm_languagde_model_test_extended_generation_28_failure_test", "package_install": [ diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index 415e05d07551..63e5f7a28de5 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -63,7 +63,7 @@ class VllmBuildParameters: # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True" use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True) dockerfile_path: Path = env_path_field( - "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm" + "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile" ) # the cleaning script to remove torch dependencies from pip diff --git a/.ci/magma-rocm/Makefile b/.ci/magma-rocm/Makefile index 5f63da87bc4d..9fca7ad54461 100644 --- 
a/.ci/magma-rocm/Makefile +++ b/.ci/magma-rocm/Makefile @@ -1,11 +1,11 @@ SHELL=/usr/bin/env bash DOCKER_CMD ?= docker -DESIRED_ROCM ?= 6.4 +DESIRED_ROCM ?= 7.0 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) PACKAGE_NAME = magma-rocm # inherit this from underlying docker image, do not pass this env var to docker -#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 +#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ -v $(shell git rev-parse --show-toplevel)/.ci:/builder \ @@ -16,20 +16,20 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ magma-rocm/build_magma.sh .PHONY: all +all: magma-rocm70 all: magma-rocm64 -all: magma-rocm63 .PHONY: clean: $(RM) -r magma-* $(RM) -r output +.PHONY: magma-rocm70 +magma-rocm70: DESIRED_ROCM := 7.0 +magma-rocm70: + $(DOCKER_RUN) + .PHONY: magma-rocm64 magma-rocm64: DESIRED_ROCM := 6.4 magma-rocm64: $(DOCKER_RUN) - -.PHONY: magma-rocm63 -magma-rocm63: DESIRED_ROCM := 6.3 -magma-rocm63: - $(DOCKER_RUN) diff --git a/.ci/magma-rocm/build_magma.sh b/.ci/magma-rocm/build_magma.sh index 4acb3fb0dc3b..c7c7780227ea 100755 --- a/.ci/magma-rocm/build_magma.sh +++ b/.ci/magma-rocm/build_magma.sh @@ -6,8 +6,8 @@ set -eou pipefail # The script expects DESIRED_CUDA and PACKAGE_NAME to be set ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -# Version 2.7.2 + ROCm related updates -MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 +# https://github.com/icl-utk-edu/magma/pull/65 +MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec # Folders for the build PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata @@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE # Fetch magma sources and verify checksum pushd ${PACKAGE_DIR} -git clone https://bitbucket.org/icl/magma.git +git clone https://github.com/jeffdaily/magma pushd magma git checkout ${MAGMA_VERSION} popd diff --git a/.ci/manywheel/build_common.sh b/.ci/manywheel/build_common.sh index 4c268befb30e..b84268fd1289 100644 --- a/.ci/manywheel/build_common.sh +++ b/.ci/manywheel/build_common.sh @@ -142,7 +142,7 @@ time CMAKE_ARGS=${CMAKE_ARGS[@]} \ EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ - python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR + python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR echo "Finished setup.py bdist at $(date)" # Build libtorch packages diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 6ed38f8b25c6..2a822295e036 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then export USE_CUFILE=0 else DEPS_LIST+=( - "/usr/local/cuda/lib64/libnvToolsExt.so.1" "/usr/local/cuda/lib64/libcublas.so.12" "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcudart.so.12" "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") DEPS_SONAME+=( - "libnvToolsExt.so.1" "libcublas.so.12" "libcublasLt.so.12" "libcudart.so.12" "libnvrtc.so.12" "libcupti.so.12") + + if [[ $CUDA_VERSION != 12.9* ]]; then + DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") + DEPS_SONAME+=("libnvToolsExt.so.1") + fi fi else echo "Using nvidia 
libs from pypi." diff --git a/.ci/manywheel/build_libtorch.sh b/.ci/manywheel/build_libtorch.sh index 4de775b1823c..d78fbd5c3ed3 100644 --- a/.ci/manywheel/build_libtorch.sh +++ b/.ci/manywheel/build_libtorch.sh @@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr fi -echo "Calling 'python -m pip install .' at $(date)" +echo "Calling 'python -m pip install . -v --no-build-isolation' at $(date)" if [[ $LIBTORCH_VARIANT = *"static"* ]]; then STATIC_CMAKE_FLAG="-DTORCH_STATIC=1" diff --git a/.ci/manywheel/build_rocm.sh b/.ci/manywheel/build_rocm.sh index ffc15bcdc5fa..bac56746f450 100755 --- a/.ci/manywheel/build_rocm.sh +++ b/.ci/manywheel/build_rocm.sh @@ -107,6 +107,10 @@ if [[ $ROCM_INT -ge 60200 ]]; then ROCM_SO_FILES+=("librocm-core.so") fi +if [[ $ROCM_INT -ge 70000 ]]; then + ROCM_SO_FILES+=("librocroller.so") +fi + OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then LIBGOMP_PATH="/usr/lib64/libgomp.so.1" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index 1c88554c2af9..cae81a2568d5 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -89,7 +89,7 @@ fi if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then export USE_MKLDNN=1 export USE_MKLDNN_ACL=1 - export ACL_ROOT_DIR=/ComputeLibrary + export ACL_ROOT_DIR=/acl fi if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then @@ -233,7 +233,9 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi -if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then + export CMAKE_BUILD_TYPE=Debug +elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then export CMAKE_BUILD_TYPE=RelWithAssert fi @@ -290,15 +292,20 @@ else WERROR=1 python setup.py clean - WERROR=1 python setup.py bdist_wheel + WERROR=1 python -m build --wheel --no-isolation else python setup.py clean if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then source .ci/pytorch/install_cache_xla.sh fi - python setup.py bdist_wheel + python -m build --wheel --no-isolation fi pip_install_whl "$(echo dist/*.whl)" + if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then + # Regression test for https://github.com/pytorch/pytorch/issues/164297 + # Torch should be importable and that's about it + pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd + fi if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then install_torchvision diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index 6e417bf8bbe9..f085fa78bebe 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp -time make VERBOSE=1 html -j +time make VERBOSE=1 html popd popd diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh index d41c3c08e628..c01efda11ea6 100755 --- a/.ci/pytorch/macos-build.sh +++ b/.ci/pytorch/macos-build.sh @@ -35,11 +35,12 @@ fi print_cmake_info if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then - USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel + # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls + USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation else - # NB: we always build with distributed; USE_DISTRIBUTED turns off all - # backends (specifically the gloo backend), so test that this case works too - USE_DISTRIBUTED=0 
USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 + # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests + # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 + USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64 fi if which sccache > /dev/null; then print_sccache_stats diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 79d47da43171..2687852a2c4f 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available( fi popd -python -mpip install -r requirements.txt - # enable debug asserts in serialization export TORCH_SERIALIZATION_DEBUG=1 -python -mpip install --no-input -r requirements.txt - setup_test_python() { # The CircleCI worker hostname doesn't resolve to an address. # This environment variable makes ProcessGroupGloo default to @@ -59,7 +55,7 @@ test_python_shard() { setup_test_python - time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" + time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS" assert_git_not_dirty } @@ -260,7 +256,7 @@ test_torchbench_smoketest() { local device=mps local dtypes=(undefined float16 bfloat16 notset) local dtype=${dtypes[$1]} - local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16) for backend in eager inductor; do @@ -323,7 +319,7 @@ test_aoti_torchbench_smoketest() { local device=mps local dtypes=(undefined float16 bfloat16 notset) local dtype=${dtypes[$1]} - local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16) echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" local dtype_arg="--${dtype}" diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 219463f318db..039459816724 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -26,6 +26,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering + time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering time python test/run_test.py --verbose -i distributed/test_store time python test/run_test.py --verbose 
-i distributed/test_symmetric_memory time python test/run_test.py --verbose -i distributed/test_pg_wrapper diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 3e88ffe4ffd7..b0c607659c72 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -32,6 +32,9 @@ "torch::", ) +# Patterns for detecting statically linked libstdc++ symbols +STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")] + def _apply_libtorch_symbols(symbols): return [ @@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]: return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]] -def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: +def grep_symbols( + lib: str, patterns: list[Any], symbol_type: str | None = None +) -> list[str]: def _grep_symbols( symbols: list[tuple[str, str, str]], patterns: list[Any] ) -> list[str]: rc = [] for _s_addr, _s_type, s_name in symbols: + # Filter by symbol type if specified + if symbol_type and _s_type != symbol_type: + continue for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -80,6 +88,18 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: + cxx11_statically_linked_symbols = grep_symbols( + lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" + ) + num_statically_linked_symbols = len(cxx11_statically_linked_symbols) + print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}") + if num_statically_linked_symbols > 0: + raise RuntimeError( + f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}" + ) + + def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -107,6 +127,7 @@ def main() -> None: libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) + check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) if __name__ == "__main__": diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 7290ff6c8954..3e2dc09ef495 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -34,12 +34,14 @@ fi # Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 -NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) -if [ -n "$NUMBA_CUDA_DIR" ]; then - NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" - pushd "$NUMBA_CUDA_DIR" - patch -p4 <"$NUMBA_PATCH" - popd +if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then + NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) + if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd + fi fi echo "Environment variables:" @@ -322,20 +324,26 @@ test_python_shard() { # modify LD_LIBRARY_PATH to ensure it has the conda env. 
# This set of tests has been shown to be buggy without it for the split-build - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python_smoke() { - # Smoke tests for H100 - time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + # Smoke tests for H100/B200 + time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running + assert_git_not_dirty +} + +test_python_smoke_b200() { + # Targeted smoke tests for B200 - staged approach to avoid too many failures + time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running assert_git_not_dirty } @@ -384,6 +392,7 @@ test_dynamo_wrapped_shard() { --exclude-distributed-tests \ --exclude-torch-export-tests \ --exclude-aot-dispatch-tests \ + --exclude-quantization-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose \ --upload-artifacts-while-running @@ -428,7 +437,7 @@ test_inductor_distributed() { # this runs on both single-gpu and multi-gpu instance. 
It should be smart about skipping tests that aren't supported # with if required # gpus aren't available - python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose + python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose assert_git_not_dirty } @@ -476,6 +485,22 @@ test_inductor_aoti() { /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile } +test_inductor_aoti_cross_compile_for_windows() { + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + # Set WINDOWS_CUDA_HOME environment variable + WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted" + export WINDOWS_CUDA_HOME + + echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME" + echo "Contents:" + ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true + + python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib" +} + test_inductor_cpp_wrapper_shard() { if [[ -z "$NUM_TEST_SHARDS" ]]; then echo "NUM_TEST_SHARDS must be defined to run a Python test shard" @@ -829,7 +854,7 @@ test_dynamo_benchmark() { elif [[ "${suite}" == "timm_models" ]]; then export TORCHBENCH_ONLY_MODELS="inception_v3" elif [[ "${suite}" == "torchbench" ]]; then - export TORCHBENCH_ONLY_MODELS="hf_Bert" + export TORCHBENCH_ONLY_MODELS="BERT_pytorch" fi fi test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@" @@ -860,13 +885,13 @@ test_inductor_torchbench_smoketest_perf() { mkdir -p "$TEST_REPORTS_DIR" python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \ - --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \ + --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \ --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 # Check memory compression ratio for a few models - for test in hf_Albert timm_vision_transformer; do + for test in BERT_pytorch yolov3; do python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \ --disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \ --only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" @@ -877,7 +902,7 @@ test_inductor_torchbench_smoketest_perf() { done # Perform some "warm-start" runs for a few huggingface models. 
- for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do + for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" python benchmarks/dynamo/check_accuracy.py \ @@ -891,7 +916,7 @@ test_inductor_set_cpu_affinity(){ export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" - if [[ "${TEST_CONFIG}" != *aarch64* ]]; then + if [[ "$(uname -m)" != "aarch64" ]]; then # Use Intel OpenMP for x86 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD" @@ -905,7 +930,7 @@ test_inductor_set_cpu_affinity(){ cores=$((cpus / thread_per_core)) # Set number of cores to 16 on aarch64 for performance runs - if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then + if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then cores=16 fi export OMP_NUM_THREADS=$cores @@ -1156,6 +1181,12 @@ test_distributed() { fi } +test_quantization() { + echo "Testing quantization" + + python test/test_quantization.py +} + test_rpc() { echo "Testing RPC C++ tests" # NB: the ending test_rpc must match the current function name for the current @@ -1402,7 +1433,7 @@ EOF pip3 install -r requirements.txt # shellcheck source=./common-build.sh source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" - python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist" + python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist" python -mpip install base_dist/*.whl echo "::endgroup::" @@ -1550,14 +1581,10 @@ test_executorch() { install_torchvision install_torchaudio - pushd /executorch - - export PYTHON_EXECUTABLE=python - export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" + INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh" - # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch - # from the PR - bash .ci/scripts/setup-linux.sh --build-tool cmake + pushd /executorch + "${INSTALL_SCRIPT}" setup_executorch echo "Run ExecuTorch unit tests" pytest -v -n auto @@ -1571,17 +1598,13 @@ test_executorch() { popd - # Test torchgen generated code for Executorch. 
- echo "Testing ExecuTorch op registration" - "$BUILD_BIN_DIR"/test_edge_op_registration - assert_git_not_dirty } test_linux_aarch64() { python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \ - test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \ + test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \ distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \ --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose @@ -1608,11 +1631,12 @@ test_operator_benchmark() { TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" TEST_DIR=$(pwd) + ARCH=$(uname -m) test_inductor_set_cpu_affinity cd benchmarks/operator_benchmark/pt_extension - python -m pip install . + python -m pip install . -v --no-build-isolation cd "${TEST_DIR}"/benchmarks/operator_benchmark $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \ @@ -1622,9 +1646,28 @@ test_operator_benchmark() { pip_install pandas python check_perf_csv.py \ --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \ - --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" + --expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv" } +test_operator_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + cd benchmarks/operator_benchmark/pt_extension + python -m pip install . + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + + for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \ + --benchmark-name "PyTorch operator microbenchmark" --use-compile + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \ + --benchmark-name "PyTorch operator microbenchmark" + done +} if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1640,7 +1683,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0 fi python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py -elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then +elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then test_linux_aarch64 elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility @@ -1657,6 +1700,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then test_executorch elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then test_python_legacy_jit +elif [[ "$TEST_CONFIG" == 'quantization' ]]; then + test_quantization elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" @@ -1679,6 +1724,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then + test_operator_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1687,6 +1734,8 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then test_inductor_triton_cpu elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then test_inductor_micro_benchmark +elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then + test_inductor_aoti_cross_compile_for_windows elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1781,10 +1830,14 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then test_xpu_bin elif [[ "${TEST_CONFIG}" == smoke ]]; then test_python_smoke +elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then + test_python_smoke_b200 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then test_h100_distributed elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then test_h100_symm_mem +elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then + test_h100_symm_mem elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then test_h100_cutlass_backend else diff --git a/.ci/pytorch/test_fa3_abi_stable.sh b/.ci/pytorch/test_fa3_abi_stable.sh new file mode 100755 index 000000000000..ff71e9887293 --- /dev/null +++ b/.ci/pytorch/test_fa3_abi_stable.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -ex -o pipefail + +# Suppress ANSI color escape sequences +export TERM=vt100 + +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# shellcheck source=./common-build.sh +source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh" + +echo "Environment variables" +env + +echo "Testing FA3 stable wheel still works with currently built torch" + +echo "Installing ABI Stable FA3 wheel" +# The wheel was built on https://github.com/Dao-AILab/flash-attention/commit/b3846b059bf6b143d1cd56879933be30a9f78c81 +# on torch nightly torch==2.9.0.dev20250830+cu129 +$MAYBE_SUDO pip -q install https://s3.amazonaws.com/ossci-linux/wheels/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl + +pushd flash-attention/hopper +export PYTHONPATH=$PWD +pytest -v -s \ + 
"test_flash_attn.py::test_flash_attn_output[1-1-192-False-False-False-0.0-False-False-mha-dtype0]" \ + "test_flash_attn.py::test_flash_attn_varlen_output[511-1-64-True-False-False-0.0-False-False-gqa-dtype2]" \ + "test_flash_attn.py::test_flash_attn_kvcache[1-128-128-False-False-True-None-0.0-False-False-True-False-True-False-gqa-dtype0]" \ + "test_flash_attn.py::test_flash_attn_race_condition[97-97-192-True-dtype0]" \ + "test_flash_attn.py::test_flash_attn_combine[2-3-64-dtype1]" \ + "test_flash_attn.py::test_flash3_bw_compatibility" +popd diff --git a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 index 29b3e913439c..a165f2a222ca 100644 --- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 +++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1 @@ -70,7 +70,7 @@ sccache --zero-stats sccache --show-stats # Build the wheel -python setup.py bdist_wheel +python -m build --wheel --no-build-isolation if ($LASTEXITCODE -ne 0) { exit 1 } # Install the wheel locally diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6..240cc8b55932 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -38,10 +38,12 @@ if errorlevel 1 goto fail if not errorlevel 0 goto fail :: Update CMake +:: TODO: Investigate why this helps MKL detection, even when CMake from choco is not used call choco upgrade -y cmake --no-progress --installargs 'ADD_CMAKE_TO_PATH=System' --apply-install-arguments-to-dependencies --version=3.27.9 if errorlevel 1 goto fail if not errorlevel 0 goto fail +:: TODO: Move to .ci/docker/requirements-ci.txt call pip install mkl==2024.2.0 mkl-static==2024.2.0 mkl-include==2024.2.0 if errorlevel 1 goto fail if not errorlevel 0 goto fail @@ -130,14 +132,14 @@ if "%USE_CUDA%"=="1" ( :: Print all existing environment variable for debugging set -python setup.py bdist_wheel +python -m build --wheel --no-isolation if errorlevel 1 goto fail if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5..abd2c8722b11 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. 
This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786..3173582b06f4 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test-helpers/test_libtorch.bat b/.ci/pytorch/win-test-helpers/test_libtorch.bat index ed80fadbc25c..d6ecd7218876 100644 --- a/.ci/pytorch/win-test-helpers/test_libtorch.bat +++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat @@ -15,37 +15,35 @@ if errorlevel 1 exit /b 1 if not errorlevel 0 exit /b 1 cd %TMP_DIR_WIN%\build\torch\test + +:: Enable delayed variable expansion to make the list +setlocal enabledelayedexpansion +set EXE_LIST= for /r "." %%a in (*.exe) do ( - call :libtorch_check "%%~na" "%%~fa" + if "%%~na" == "c10_intrusive_ptr_benchmark" ( + @REM NB: This is not a gtest executable file, thus couldn't be handled by + @REM pytest-cpp and is excluded from test discovery by run_test + call "%%~fa" if errorlevel 1 goto fail + if not errorlevel 0 goto fail + ) else ( + if "%%~na" == "verify_api_visibility" ( + @REM Skip verify_api_visibility as it is a compile-level test + ) else ( + set EXE_LIST=!EXE_LIST! cpp/%%~na + ) + ) ) -goto :eof - -:libtorch_check - cd %CWD% set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test -:: Skip verify_api_visibility as it a compile level test -if "%~1" == "verify_api_visibility" goto :eof +:: Run python test\run_test.py on the list +set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST! 
+if errorlevel 1 goto fail +if not errorlevel 0 goto fail -echo Running "%~2" -if "%~1" == "c10_intrusive_ptr_benchmark" ( - :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp - call "%~2" - goto :eof -) - -python test\run_test.py --cpp --verbose -i "cpp/%~1" -if errorlevel 1 ( - echo %1 failed with exit code %errorlevel% - goto fail -) -if not errorlevel 0 ( - echo %1 failed with exit code %errorlevel% - goto fail -) +goto :eof :eof exit /b 0 diff --git a/.ci/pytorch/win-test-helpers/test_python_shard.bat b/.ci/pytorch/win-test-helpers/test_python_shard.bat index d0fa3babe59d..02829ee36975 100644 --- a/.ci/pytorch/win-test-helpers/test_python_shard.bat +++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat @@ -25,7 +25,7 @@ echo Copying over test times file robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files" echo Run nn tests -python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose +python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose if ERRORLEVEL 1 goto fail popd diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index 43524dc04e3f..a01aa0b6431c 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -37,23 +37,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" fi -# TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 - -# Install Z3 optional dependency for Windows builds. -python -m pip install z3-solver==4.15.1.0 - -# Install tlparse for test\dynamo\test_structured_trace.py UTs. 
-python -m pip install tlparse==0.4.0 - -# Install parameterized -python -m pip install parameterized==0.8.1 - -# Install pulp for testing ilps under torch\distributed\_tools -python -m pip install pulp==2.9.0 - -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 +# TODO: Move this to .ci/docker/requirements-ci.txt +python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2" run_tests() { # Run nvidia-smi if available diff --git a/.ci/pytorch/windows/arm64/build_pytorch.bat b/.ci/pytorch/windows/arm64/build_pytorch.bat index 3363a2d08846..b5c2ef65b84a 100644 --- a/.ci/pytorch/windows/arm64/build_pytorch.bat +++ b/.ci/pytorch/windows/arm64/build_pytorch.bat @@ -48,7 +48,7 @@ sccache --zero-stats sccache --show-stats :: Call PyTorch build script -python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" :: show sccache stats sccache --show-stats diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbdfb4bd1bb7..bbd349e2efb4 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( - set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/internal/install_python.bat b/.ci/pytorch/windows/internal/install_python.bat index 84d0f9caccef..86626e15fbc4 100644 --- a/.ci/pytorch/windows/internal/install_python.bat +++ b/.ci/pytorch/windows/internal/install_python.bat @@ -28,5 +28,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t if errorlevel 1 exit /b 1 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%" -%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel +%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build if errorlevel 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/setup.bat b/.ci/pytorch/windows/internal/setup.bat index 71056540464c..34a5140cb1ee 100644 --- a/.ci/pytorch/windows/internal/setup.bat +++ b/.ci/pytorch/windows/internal/setup.bat @@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_ goto build_end :pytorch -%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%" +%PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%" :build_end IF ERRORLEVEL 1 exit /b 1 diff --git a/.ci/pytorch/windows/internal/static_lib_test.bat b/.ci/pytorch/windows/internal/static_lib_test.bat index 
5f23a63d5c20..cd1fc484ae15 100644 --- a/.ci/pytorch/windows/internal/static_lib_test.bat +++ b/.ci/pytorch/windows/internal/static_lib_test.bat @@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1 call %CONDA_HOME%\condabin\activate.bat testenv if errorlevel 1 exit /b 1 -call conda install -y -q -c conda-forge libuv=1.39 +call conda install -y -q -c conda-forge libuv=1.51 call conda install -y -q intel-openmp echo "install and test libtorch" diff --git a/.ci/pytorch/windows/setup_build.bat b/.ci/pytorch/windows/setup_build.bat index dbdc9891324c..a7addd5d712d 100644 --- a/.ci/pytorch/windows/setup_build.bat +++ b/.ci/pytorch/windows/setup_build.bat @@ -18,7 +18,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake %PYTHON_EXEC% -m pip install pyyaml %PYTHON_EXEC% -m pip install mkl-include mkl-static -%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0 +%PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0 where cmake.exe diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index 98b50c0ceeaf..6123e8abc8c0 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -143,7 +143,8 @@ case $desired_python in RENAME_WHEEL=false ;; 3.13t) - echo "Using 3.13 deps" + echo "Using 3.13t deps" + mac_version='macosx-11.0-arm64' NUMPY_PINNED_VERSION="==2.1.0" RENAME_WHEEL=false ;; @@ -177,8 +178,7 @@ source ~/${desired_python}-build/bin/activate retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt" retry brew install libomp -# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which -# is build as part of tensorpipe submodule +# For USE_DISTRIBUTED=1 on macOS, we need libuv, which is built as part of the tensorpipe submodule export USE_DISTRIBUTED=1 export USE_MKLDNN=OFF @@ -186,11 +186,11 @@ export USE_QNNPACK=OFF export BUILD_TEST=OFF pushd "$pytorch_rootdir" -echo "Calling setup.py bdist_wheel at $(date)" +echo "Calling -m build --wheel --no-isolation at $(date)" -_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}" +_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}" -echo "Finished setup.py bdist_wheel at $(date)" +echo "Finished -m build --wheel --no-isolation at $(date)" if [[ $package_type != 'libtorch' ]]; then echo "delocating wheel dependencies" diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index f5b949858d60..f12a3ac07517 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - -# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT -TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" - -# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then - TRITON_CONSTRAINT="platform_system == 'Linux'" -fi +TRITON_CONSTRAINT="platform_system == 'Linux'" if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && !
"$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" diff --git a/.circleci/scripts/functorch_doc_push_script.sh b/.circleci/scripts/functorch_doc_push_script.sh deleted file mode 100755 index 010956e21252..000000000000 --- a/.circleci/scripts/functorch_doc_push_script.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# =================== The following code **should** be executed inside Docker container =================== - -# Install dependencies -sudo apt-get -y update -sudo apt-get -y install expect-dev - -# This is where the local pytorch install in the docker image is located -pt_checkout="/var/lib/jenkins/workspace" -source "$pt_checkout/.ci/pytorch/common_utils.sh" -echo "functorch_doc_push_script.sh: Invoked with $*" - -set -ex - -version=${DOCS_VERSION:-nightly} -echo "version: $version" - -# Build functorch docs -pushd $pt_checkout/functorch/docs -pip -q install -r requirements.txt -make html -popd - -git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages -pushd functorch_ghpages - -if [ $version == "main" ]; then - version=nightly -fi - -git rm -rf "$version" || true -mv "$pt_checkout/functorch/docs/build/html" "$version" - -git add "$version" || true -git status -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true -git status - -if [[ "${WITH_PUSH:-}" == true ]]; then - git push -u origin gh-pages -fi - -popd -# =================== The above code **should** be executed inside Docker container =================== diff --git a/.clang-tidy b/.clang-tidy index 4b1548d646b2..71ffdf8cb224 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -59,16 +59,19 @@ performance-*, -performance-enum-size, readability-container-size-empty, readability-delete-null-pointer, -readability-duplicate-include +readability-duplicate-include, readability-misplaced-array-index, -readability-redundant* +readability-redundant*, readability-simplify-subscript-expr, readability-string-compare, -readability-redundant-access-specifiers, -readability-redundant-control-flow, +-readability-redundant-inline-specifier, ' HeaderFilterRegex: '^(aten/|c10/|torch/).*$' WarningsAsErrors: '*' +LineFilter: + - name: '/usr/include/.*' CheckOptions: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true diff --git a/.flake8 b/.flake8 index fa73b7b880fd..937234edb403 100644 --- a/.flake8 +++ b/.flake8 @@ -7,16 +7,12 @@ max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead ignore = - E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, + E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, # shebang has extra meaning in fbcode lints, so I think it's not worth trying # to line this up with executable bit EXE001, # these ignores are from flake8-bugbear; please fix! - B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910 - # these ignores are from flake8-comprehensions; please fix! - C407, - # these ignores are from flake8-logging-format; please fix! - G100,G101,G200 + B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 # these ignores are from flake8-simplify. 
please fix or ignore with commented reason SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, # SIM104 is already covered by pyupgrade ruff diff --git a/.github/ISSUE_TEMPLATE/ci-sev.md b/.github/ISSUE_TEMPLATE/ci-sev.md index a7e7006aaea5..1ed74161f55d 100644 --- a/.github/ISSUE_TEMPLATE/ci-sev.md +++ b/.github/ISSUE_TEMPLATE/ci-sev.md @@ -1,9 +1,14 @@ --- name: "⚠️ CI SEV" about: Tracking incidents for PyTorch's CI infra. +title: '' +labels: '' +assignees: '' + --- > NOTE: Remember to label this issue with "`ci: sev`" +> If you want autorevert to be disabled, keep the ci: disable-autorevert label diff --git a/.github/ISSUE_TEMPLATE/disable-autorevert.md b/.github/ISSUE_TEMPLATE/disable-autorevert.md new file mode 100644 index 000000000000..a76f2e4222eb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/disable-autorevert.md @@ -0,0 +1,18 @@ +--- +name: "D❌​\U0001F519​ ISABLE AUTOREVERT" +about: Disables autorevert when open +title: "[DISABLE AUTOREVERT]" +labels: 'ci: disable-autorevert' +assignees: '' + +--- + +This issue, while open, disables the autorevert functionality. + +More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md) + + +## Why are you disabling autorevert? + + +## Links to any issues/commits/errors that shows the source of problem diff --git a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md index 8bea044cfd4b..d9e0cc22bd3f 100644 --- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md +++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md @@ -1,8 +1,10 @@ --- name: Disable CI jobs (PyTorch Dev Infra only) about: Use this template to disable CI jobs -title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]" -labels: "module: ci" +title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME] +labels: 'module: ci' +assignees: '' + --- > For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index d4a7df9d5805..2c49247c0aa5 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -22,6 +22,9 @@ self-hosted-runner: - linux.arm64.m7g.4xlarge - linux.arm64.m7g.4xlarge.ephemeral - linux.arm64.r7g.12xlarge.memory + - linux.aws.h100 + - linux.aws.h100.4 + - linux.aws.h100.8 - linux.4xlarge.nvidia.gpu - linux.8xlarge.nvidia.gpu - linux.16xlarge.nvidia.gpu @@ -51,12 +54,17 @@ self-hosted-runner: - windows-11-arm64 - windows-11-arm64-preview # Organization-wide AMD-hosted runners - # MI2xx runners + # MI2xx non-ARC runners - linux.rocm.gpu - - linux.rocm.gpu.mi250 - linux.rocm.gpu.2 - linux.rocm.gpu.4 - # gfx942 runners + - linux.rocm.gpu.mi250 + - linux.rocm.gpu.gfx1100 + # MI2xx ARC runners + - linux.rocm.gpu.mi250.1 + - linux.rocm.gpu.mi250.2 + - linux.rocm.gpu.mi250.4 + # gfx942 ARC runners - linux.rocm.gpu.gfx942.1 - linux.rocm.gpu.gfx942.2 - linux.rocm.gpu.gfx942.4 diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml index c0c727d93ac6..049c3ce28e45 100644 --- a/.github/actions/build-external-packages/action.yml +++ b/.github/actions/build-external-packages/action.yml @@ -65,7 +65,7 @@ runs: cd .ci/lumen_cli python3 -m pip install -e . 
) - MAX_JOBS="$(nproc --ignore=6)" + MAX_JOBS="$(nproc --ignore=10)" export MAX_JOBS # Split the comma-separated list and build each target diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml index 32fe1d7385b1..f29d776402ba 100644 --- a/.github/actions/linux-test/action.yml +++ b/.github/actions/linux-test/action.yml @@ -274,8 +274,6 @@ runs: -w /var/lib/jenkins/workspace \ "${DOCKER_IMAGE}" ) - # Propagate download.pytorch.org IP to container - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" diff --git a/.github/actions/reuse-old-whl/reuse_old_whl.py b/.github/actions/reuse-old-whl/reuse_old_whl.py index def0276a9c8a..48a849098594 100644 --- a/.github/actions/reuse-old-whl/reuse_old_whl.py +++ b/.github/actions/reuse-old-whl/reuse_old_whl.py @@ -264,7 +264,7 @@ def change_content_to_new_version(file: Union[str, Path]) -> None: change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py") for file in Path(f"artifacts/dist/{old_stem}").glob( - "*.dist-info/**", + "*.dist-info/*", ): change_content_to_new_version(file) diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 5af32ac03497..4370549e4801 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -28,6 +28,10 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" + - name: Print GPU info (if present) + shell: bash + run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi + - name: Check if in a container runner shell: bash id: check_container_runner @@ -82,37 +86,6 @@ runs: # Prune all of the docker images docker system prune -af - - name: Manually resolve download.pytorch.org - shell: bash - continue-on-error: true - run: | - set +e - set -x - - PT_DOMAIN=download.pytorch.org - # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400, - # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last - # one is returned at random - RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1) - - if [ -z "${RESOLVED_IP}" ]; then - echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..." - RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1) - - if [ -z "${RESOLVED_IP}" ]; then - echo "Couldn't resolve ${PT_DOMAIN}, exiting..." - exit 1 - fi - fi - - if grep -r "${PT_DOMAIN}" /etc/hosts; then - # Clean up any old records first - sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts - fi - - echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts - cat /etc/hosts - - name: Check that the docker daemon is running shell: bash continue-on-error: true diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index a58db801b1cf..07c649985b79 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -111,3 +111,16 @@ runs: # This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries. # The group name corresponding to group ID 1 can change depending on the OS, so both are necessary. 
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}" + + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e..2ea330f93b49 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/actions/teardown-win/action.yml b/.github/actions/teardown-win/action.yml index 799b20812b96..b5e5f74db037 100644 --- a/.github/actions/teardown-win/action.yml +++ b/.github/actions/teardown-win/action.yml @@ -23,9 +23,6 @@ runs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - # Cleaning up Windows workspace sometimes fails flakily with device or resource busy # error, meaning one or more processes haven't stopped completely yet. So trying to # retry this step several time similar to how checkout-pytorch GHA does diff --git a/.github/actions/test-pytorch-binary/action.yml b/.github/actions/test-pytorch-binary/action.yml index d4b8be8b609a..991cf9fb87ef 100644 --- a/.github/actions/test-pytorch-binary/action.yml +++ b/.github/actions/test-pytorch-binary/action.yml @@ -33,10 +33,6 @@ runs: ) echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV" - if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then - # Propagate download.pytorch.org IP to container. 
This is only needed on Linux non aarch64 runner - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts" - fi docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh" # Generate test script diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 05e0b684b427..8af554d56ee5 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -87ff22e49ed0e92576c4935ccb8c143daac4a3cd +69bbe7363897764f9e758d851cd0340147d27f94 diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index 4a57d6e374bd..6cc41d703bd5 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -966da7e46f65d6d49df3e31214470a4fe5cc8e66 +faffd5cf673615583da6517275e361cb3dbc77e6 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 8ac38f3e1f4c..45ad7752358c 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -5963b98b465007e3cfb0d39447e4459a8afa96dc +e5192819208c4d68194844b7dfafbc00020d0dea diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 504d924ec764..1bac2adbb56d 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -c77852e117bdf056c8e9a087e51d6f65cf6ba53d +0fa6e3129e61143224663e1ec67980d12b7ec4eb diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile similarity index 61% rename from .github/ci_configs/vllm/Dockerfile.tmp_vllm rename to .github/ci_configs/vllm/Dockerfile index a1b68ad28210..1aefa1be9831 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile @@ -1,59 +1,41 @@ -# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo -# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing - ARG CUDA_VERSION=12.8.1 ARG PYTHON_VERSION=3.12 # BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine, # by default, it uses the torch-nightly-base stage from this docker image ARG BUILD_BASE_IMAGE=torch-nightly-base - -# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer, -# by default, it uses devel-ubuntu22.04 official image. 
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" - #################### TORCH NIGHTLY BASE IMAGE #################### -# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base ARG CUDA_VERSION ARG PYTHON_VERSION ARG GET_PIP_URL -# Install Python and other dependencies +# Install system dependencies and uv, then create Python virtual environment RUN apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ + && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ + && ln -s /opt/venv/bin/python3 /usr/bin/python3 \ + && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \ + && ln -s /opt/venv/bin/pip /usr/bin/pip \ && python3 --version && python3 -m pip --version # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels -# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) -RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \ - if command -v apt-get >/dev/null; then \ - if [ "$current_gcc_version" -lt 10 ]; then \ - echo "GCC version is $current_gcc_version, installing gcc-10..."; \ - apt-get update \ - && apt-get install -y gcc-10 g++-10 \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ - && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ - else \ - echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ - fi \ - fi \ - && gcc --version && g++ --version +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN </dev/null; then \ apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + && apt-get install -y ccache software-properties-common git wget sudo vim; \ else \ - dnf install -y git curl wget sudo; \ + dnf install -y git wget sudo; \ fi \ && python3 --version && python3 -m pip --version # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ - if ! 
python3 -m uv --version >/dev/null 2>&1; then \ - python3 -m pip install uv==0.8.4; \ - fi + python3 -m pip install uv==0.8.4 + ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts @@ -98,15 +76,15 @@ ENV UV_LINK_MODE=copy WORKDIR /workspace -# install build and runtime dependencies +# Install build and runtime dependencies COPY requirements/common.txt requirements/common.txt COPY use_existing_torch.py use_existing_torch.py COPY pyproject.toml pyproject.toml -# install build and runtime dependencies without stable torch version +# Install build and runtime dependencies without stable torch version RUN python3 use_existing_torch.py -# default mount file as placeholder, this just avoid the mount error +# Default mount file as placeholder, this just avoid the mount error # change to a different vllm folder if this does not exist anymore ARG TORCH_WHEELS_PATH="./requirements" ARG PINNED_TORCH_VERSION @@ -138,56 +116,36 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/common.txt -# Must put before installing xformers, so it can install the correct version of xfomrers. -ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' -ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} - ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -RUN echo ${TORCH_CUDA_ARCH_LIST} -RUN echo ${MAX_JOBS} -RUN pip freeze | grep -E 'ninja' +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a' + git clone https://github.com/facebookresearch/xformers.git -# Build xformers with cuda and torch nightly/wheel -# following official xformers guidance: https://github.com/facebookresearch/xformers#build -# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 -ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 -ENV CCACHE_DIR=/root/.cache/ccache + pushd xformers + git checkout v0.0.32.post2 + git submodule update --init --recursive + python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose + popd -RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/uv \ - echo 'git clone xformers...' \ - && git clone https://github.com/facebookresearch/xformers.git --recursive \ - && cd xformers \ - && git checkout ${XFORMERS_COMMIT} \ - && git submodule update --init --recursive \ - && echo 'finish git clone xformers...' \ - && rm -rf build \ - && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \ - && cd .. \ - && rm -rf xformers + rm -rf xformers +BASH RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system xformers-dist/*.whl --verbose + uv pip install --system xformers-dist/*.whl -# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. -# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt - RUN cat torch_build_versions.txt RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' - #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### -# Image used to build vllm wheel FROM base AS build ARG TARGETPLATFORM COPY . . 
- RUN python3 use_existing_torch.py RUN --mount=type=cache,target=/root/.cache/uv \ @@ -197,20 +155,17 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi -# Max jobs used by Ninja to build extensions ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} -ARG nvcc_threads=4 +ARG nvcc_threads=8 ENV NVCC_THREADS=$nvcc_threads -ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' -ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ARG USE_SCCACHE ARG SCCACHE_BUCKET_NAME=vllm-build-sccache ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 -# if USE_SCCACHE is set, use sccache to speed up compilation +# Use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "$USE_SCCACHE" = "1" ]; then \ @@ -235,6 +190,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && sccache --show-stats; \ fi +ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0' +ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} + ARG vllm_target_device="cuda" ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache @@ -248,17 +206,10 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ fi - -RUN echo "[INFO] Listing current directory:" && \ - ls -al && \ - echo "[INFO] Showing torch_build_versions.txt content:" && \ - cat torch_build_versions.txt - #################### WHEEL BUILD IMAGE #################### ################### VLLM INSTALLED IMAGE #################### -# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer FROM ${FINAL_BASE_IMAGE} AS vllm-base USER root @@ -266,7 +217,7 @@ ARG CUDA_VERSION ARG PYTHON_VERSION ARG GET_PIP_URL -# TODO (huydhn): Only work with PyTorch manylinux builder +# Only work with PyTorch manylinux builder ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" # prepare for environment starts @@ -275,20 +226,19 @@ WORKDIR /workspace # Install Python and other dependencies RUN if command -v apt-get >/dev/null; then \ apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl wget sudo vim \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt-get update -y \ - && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ - && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ - && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ - && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ - && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + && apt-get install -y ccache software-properties-common git sudo vim python3-pip; \ else \ - dnf install -y git curl wget sudo; \ + dnf install -y git wget sudo; \ fi \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ + && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ + && ln -s /opt/venv/bin/python3 /usr/bin/python3 \ + && ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \ + && ln -s /opt/venv/bin/pip /usr/bin/pip \ && python3 --version && python3 -m pip --version -# Get the torch versions, and whls used in previous stagtes for consistency +# Get the torch versions, and whls used in previous stage COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt COPY --from=base 
/workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm @@ -299,19 +249,27 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \ # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ - if ! python3 -m uv --version > /dev/null 2>&1; then \ - python3 -m pip install uv==0.8.4; \ - fi + python3 -m pip install uv==0.8.4 + ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install build and runtime dependencies, this is needed for flashinfer install +COPY requirements/build.txt requirements/build.txt +COPY use_existing_torch.py use_existing_torch.py +RUN python3 use_existing_torch.py +RUN cat requirements/build.txt + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r requirements/build.txt + # Default mount file as placeholder, this just avoid the mount error ARG TORCH_WHEELS_PATH="./requirements" -# Install torch, torchaudio and torchvision -# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt -# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine +# Install torch, torchaudio and torchvision. If TORCH_WHEELS_PATH is default +# to ./requirements, it will pull the nightly versions using pip. Otherwise, +# it will use the local wheels from TORCH_WHEELS_PATH RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ --mount=type=cache,target=/root/.cache/uv \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ @@ -333,19 +291,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system /wheels/xformers/*.whl --verbose -# Build flashinfer from source. +# Build FlashInfer from source ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' -# install package for build flashinfer -# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 - -RUN pip install build==1.3.0 -RUN pip freeze | grep -E 'setuptools|packaging|build' - ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -# Build flashinfer for torch nightly from source around 10 mins + ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" -# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt ARG FLASHINFER_GIT_REF="v0.2.14.post1" + RUN --mount=type=cache,target=/root/.cache/uv \ git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ @@ -357,7 +309,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && cd .. \ && rm -rf flashinfer -# install flashinfer python +# Install FlashInfer RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system wheels/flashinfer/*.whl --verbose @@ -367,49 +319,6 @@ RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm ################### VLLM INSTALLED IMAGE #################### -#################### UNITTEST IMAGE ############################# -FROM vllm-base as test - -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" -# Use copy mode to avoid hardlink failures with Docker cache mounts -ENV UV_LINK_MODE=copy - -COPY tests/ tests/ -COPY examples examples -COPY benchmarks benchmarks -COPY ./vllm/collect_env.py . 
-COPY requirements/common.txt requirements/common.txt -COPY use_existing_torch.py use_existing_torch.py -COPY pyproject.toml pyproject.toml -# Install build and runtime dependencies without stable torch version -COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt - -RUN python3 use_existing_torch.py - -# install packages -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/common.txt -# enable fast downloads from hf (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system hf_transfer -ENV HF_HUB_ENABLE_HF_TRANSFER 1 - -# install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -e tests/vllm_test_utils - -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system -r requirements/nightly_torch_test.txt - -# Logging to confirm the torch versions -RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' - -# Logging to confirm all the packages are installed -RUN pip freeze - -#################### UNITTEST IMAGE ############################# - #################### EXPORT STAGE #################### FROM scratch as export-wheels diff --git a/.github/ci_configs/vllm/use_existing_torch.py b/.github/ci_configs/vllm/use_existing_torch.py index f55db97850d9..3d59fd67a398 100644 --- a/.github/ci_configs/vllm/use_existing_torch.py +++ b/.github/ci_configs/vllm/use_existing_torch.py @@ -1,9 +1,14 @@ import glob +import os requires_files = glob.glob("requirements/*.txt") requires_files += ["pyproject.toml"] + for file in requires_files: + if not os.path.exists(file): + print(f"!!! skipping missing {file}") + continue print(f">>> cleaning {file}") with open(file) as f: lines = f.readlines() diff --git a/.github/labeler.yml b/.github/labeler.yml index 8b1acc77c267..7b47b9fefb5d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -130,3 +130,35 @@ - torch/csrc/inductor/aoti_include/** - torchgen/aoti/** - torchgen/gen_aoti_c_shim.py + +"ciflow/vllm": +- .github/ci_commit_pins/vllm.txt + +"ciflow/b200": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/h100": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/**/*cublas* +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm + +"ciflow/rocm": +- test/test_matmul_cuda.py +- test/test_scaled_matmul_cuda.py +- test/inductor/test_fp8.py +- aten/src/ATen/native/cuda/Blas.cpp +- torch/_inductor/kernel/mm.py +- test/inductor/test_max_autotune.py +- third_party/fbgemm diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index 354381755ce5..e75b80dc4689 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -525,6 +525,21 @@ - Lint - pull +- name: typechecking + patterns: + - 'pyrefly.toml' + - 'mypy.ini' + - 'mypy-strict.ini' + approved_by: + - lolpack + - maggiemoss + - ndmitchell + - kinto0 + mandatory_checks_name: + - EasyCLA + - Lint + - pull + - name: superuser patterns: - '*' diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index a0aa6921b92b..74b0d243859a 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,41 +1,48 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 
ciflow_push_tags: +- ciflow/b200 +- ciflow/b200-symm-mem +- ciflow/b200-distributed - ciflow/binaries - ciflow/binaries_libtorch - ciflow/binaries_wheel -- ciflow/triton_binaries +- ciflow/h100 +- ciflow/h100-cutlass-backend +- ciflow/h100-distributed +- ciflow/h100-symm-mem - ciflow/inductor -- ciflow/inductor-periodic -- ciflow/inductor-rocm -- ciflow/inductor-perf-test-nightly-rocm -- ciflow/inductor-perf-compare +- ciflow/inductor-cu126 - ciflow/inductor-micro-benchmark - ciflow/inductor-micro-benchmark-cpu-x86 +- ciflow/inductor-perf-compare +- ciflow/inductor-perf-test-nightly-rocm-mi300 +- ciflow/inductor-perf-test-nightly-rocm-mi355 - ciflow/inductor-perf-test-nightly-x86-zen -- ciflow/inductor-cu126 +- ciflow/inductor-periodic +- ciflow/inductor-rocm - ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly +- ciflow/op-benchmark - ciflow/periodic - ciflow/periodic-rocm-mi300 +- ciflow/pull +- ciflow/quantization-periodic +- ciflow/riscv64 - ciflow/rocm - ciflow/rocm-mi300 +- ciflow/rocm-mi355 +- ciflow/rocm-navi31 - ciflow/s390 -- ciflow/riscv64 - ciflow/slow +- ciflow/torchbench +- ciflow/triton_binaries - ciflow/trunk - ciflow/unstable -- ciflow/xpu - ciflow/vllm -- ciflow/torchbench -- ciflow/op-benchmark -- ciflow/pull -- ciflow/h100 -- ciflow/h100-distributed - ciflow/win-arm64 -- ciflow/h100-symm-mem -- ciflow/h100-cutlass-backend +- ciflow/xpu retryable_workflows: - pull - trunk @@ -44,4 +51,4 @@ retryable_workflows: - inductor-A100-perf-nightly labeler_config: labeler.yml label_to_label_config: label_to_label.yml -mergebot: True +mergebot: true diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt deleted file mode 100644 index 5fc26302a0ad..000000000000 --- a/.github/requirements/pip-requirements-macOS.txt +++ /dev/null @@ -1,36 +0,0 @@ -boto3==1.35.42 -cmake==3.27.* -expecttest==0.3.0 -fbscribelogger==0.1.7 -filelock==3.18.0 -hypothesis==6.56.4 -librosa>=0.6.2 -mpmath==1.3.0 -networkx==2.8.7 -ninja==1.10.2.4 -numba==0.59.0 -numpy==1.26.4 -opt-einsum>=3.3 -optree==0.13.0 -packaging==23.1 -parameterized==0.8.1 -pillow==10.3.0 -protobuf==5.29.5 -psutil==5.9.8 -pygments==2.15.0 -pytest-cpp==2.3.0 -pytest-flakefinder==1.1.0 -pytest-rerunfailures==10.3 -pytest-subtests==0.13.1 -pytest-xdist==3.3.1 -pytest==7.3.2 -pyyaml==6.0.2 -scipy==1.12.0 -setuptools==78.1.1 -sympy==1.13.3 -tlparse==0.4.0 -tensorboard==2.13.0 -typing-extensions==4.12.2 -unittest-xml-reporting<=3.2.0,>=2.0.0 -xdoctest==1.1.0 -z3-solver==4.15.1.0 diff --git a/.github/scripts/drci_mocks.json.gz b/.github/scripts/drci_mocks.json.gz index b8c483013694..4e03d0672bdd 100644 Binary files a/.github/scripts/drci_mocks.json.gz and b/.github/scripts/drci_mocks.json.gz differ diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 9ba210a5ed2b..592c7aab6d93 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -502,6 +502,7 @@ def perform_misc_tasks( job_name: str, pr_body: str, branch: Optional[str] = None, + tag: Optional[str] = None, ) -> None: """ In addition to apply the filter logic, the script also does the following @@ -509,7 +510,11 @@ def perform_misc_tasks( """ set_output( "keep-going", - branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"), + branch == MAIN_BRANCH + or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag)) + # Pattern for tags created via manual run on HUD + or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag)) + or 
check_for_setting(labels, pr_body, "keep-going"), ) set_output( "ci-verbose-test-logs", @@ -634,6 +639,7 @@ def main() -> None: job_name=args.job_name, pr_body=pr_body if pr_body else "", branch=args.branch, + tag=tag, ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index e57c2d5ef074..154b5a6f0b90 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,21 +16,23 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "13.0"] +CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", + "12.9": "12.9.1", "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", + "12.9": "9", "13.0": "9", } # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this -ROCM_ARCHES = ["6.3", "6.4"] +ROCM_ARCHES = ["6.4", "7.0"] XPU_ARCHES = ["xpu"] @@ -38,7 +40,7 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { @@ -76,6 +78,23 @@ "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), + "12.9": ( + "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" + ), "13.0": ( "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " @@ -222,7 +241,11 @@ def generate_libtorch_matrix( arches += CUDA_ARCHES arches += ROCM_ARCHES elif os == "windows": - arches += CUDA_ARCHES + # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up + # in 2.10 + windows_cuda_arches = CUDA_ARCHES.copy() + windows_cuda_arches.remove("12.9") + arches += windows_cuda_arches if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -286,7 +309,11 @@ def generate_wheels_matrix( if os == "linux": arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": - arches += CUDA_ARCHES + XPU_ARCHES + # TODO (huydhn): Only build CUDA 12.9 for Linux. 
This logic is to be cleaned up + # in 2.10 + windows_cuda_arches = CUDA_ARCHES.copy() + windows_cuda_arches.remove("12.9") + arches += windows_cuda_arches + XPU_ARCHES elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -322,7 +349,7 @@ def generate_wheels_matrix( # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["13.0", "12.8", "12.6"] + arch_version in ["13.0", "12.9", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -386,5 +413,6 @@ def generate_wheels_matrix( validate_nccl_dep_consistency("13.0") +validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d..7d22e5059b7c 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -127,53 +127,6 @@ class OperatingSystem: ), ] -ROCM_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_variant="rocm", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - arches=["6.4"], - python_versions=["3.9"], - ), - ciflow_config=CIFlowConfig( - labels={ - LABEL_CIFLOW_BINARIES, - LABEL_CIFLOW_BINARIES_WHEEL, - LABEL_CIFLOW_ROCM, - }, - isolated_workflow=True, - ), - branches="main", - ), -] - -LINUX_BINARY_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="manywheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.LINUX, - arches=["12.8"], - python_versions=["3.12"], - ), - branches="main", - ), - BinaryBuildWorkflow( - os=OperatingSystem.LINUX, - package_type="libtorch", - build_variant=generate_binary_build_matrix.RELEASE, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.LINUX, - generate_binary_build_matrix.RELEASE, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ), -] - WINDOWS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.WINDOWS, @@ -259,39 +212,6 @@ class OperatingSystem: ), ] -WINDOWS_BINARY_SMOKE_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.WINDOWS, - package_type="libtorch", - build_variant=generate_binary_build_matrix.RELEASE, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, - generate_binary_build_matrix.RELEASE, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ciflow_config=CIFlowConfig( - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.WINDOWS, - package_type="libtorch", - build_variant=generate_binary_build_matrix.DEBUG, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.WINDOWS, - generate_binary_build_matrix.DEBUG, - arches=["cpu"], - libtorch_variants=["shared-with-deps"], - ), - branches="main", - ciflow_config=CIFlowConfig( - isolated_workflow=True, - ), - ), -] - MACOS_BINARY_BUILD_WORKFLOWS = [ BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, @@ -372,23 +292,10 @@ def main() -> None: jinja_env.get_template("linux_binary_build_workflow.yml.j2"), S390X_BINARY_BUILD_WORKFLOWS, ), - ( - # Give rocm it's own workflow file - jinja_env.get_template("linux_binary_build_workflow.yml.j2"), - ROCM_SMOKE_WORKFLOWS, - ), - ( - 
jinja_env.get_template("linux_binary_build_workflow.yml.j2"), - LINUX_BINARY_SMOKE_WORKFLOWS, - ), ( jinja_env.get_template("windows_binary_build_workflow.yml.j2"), WINDOWS_BINARY_BUILD_WORKFLOWS, ), - ( - jinja_env.get_template("windows_binary_build_workflow.yml.j2"), - WINDOWS_BINARY_SMOKE_WORKFLOWS, - ), ( jinja_env.get_template("macos_binary_build_workflow.yml.j2"), MACOS_BINARY_BUILD_WORKFLOWS, ), diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 0fc60cb31e2a..110015988a5c 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -18,6 +18,7 @@ class GitHubComment: body_text: str created_at: str author_login: str + author_url: Optional[str] author_association: str editor_login: Optional[str] database_id: int diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index 67355239dc42..70663a01e777 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/test_check_labels.py b/.github/scripts/test_check_labels.py index 15b9d806b302..74ce276c9d10 100644 --- a/.github/scripts/test_check_labels.py +++ b/.github/scripts/test_check_labels.py @@ -38,6 +38,7 @@ def mock_get_comments() -> list[GitHubComment]: body_text="mock_body_text", created_at="", author_login="", + author_url=None, author_association="", editor_login=None, database_id=1, @@ -48,6 +49,7 @@ def mock_get_comments() -> list[GitHubComment]: body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""), created_at="", author_login=BOT_AUTHORS[1], + author_url=None, author_association="", editor_login=None, database_id=2, diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index ac3a1cc12921..790deb85ef8c 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -32,6 +32,7 @@ main as trymerge_main, MandatoryChecksMissingError, MergeRule, + PostCommentError, RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, @@ -588,6 +589,23 @@ def test_get_merge_base(self, *args: Any) -> None: self.assertEqual(mock_merge_base, pr.get_merge_base()) mocked_gh_fetch_merge_base.assert_called_once() + def test_app_can_revert(self, *args: Any) -> None: + pr = GitHubPR("pytorch", "pytorch", 164660) + repo = DummyGitRepo() + app_comment_id, impostor_comment_id = 3375785595, 3377647892 + # Check that the app can revert + self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id)) + # But an impostor cannot + self.assertRaises( + PostCommentError, + lambda: validate_revert(repo, pr, comment_id=impostor_comment_id), + ) + # Despite its name being the name of the bot + self.assertEqual( + pr.get_comment_by_id(impostor_comment_id).author_login, + "pytorch-auto-revert", + ) + @mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) @mock.patch("trymerge.gh_fetch_merge_base", return_value="") diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 00b66869dcf2..c258284a00d8 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -234,6 +234,7 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): createdAt author { login + url } authorAssociation editor { @@ -1091,8 +1092,9 @@ def _comment_from_node(node: Any) -> GitHubComment: editor = node["editor"] return GitHubComment( body_text=node["bodyText"], - created_at=node["createdAt"] if "createdAt" in node else "", + created_at=node.get("createdAt", ""), author_login=node["author"]["login"], +
author_url=node["author"].get("url", None), author_association=node["authorAssociation"], editor_login=editor["login"] if editor else None, database_id=node["databaseId"], @@ -2029,16 +2031,17 @@ def validate_revert( # For some reason, one can not be a member of private repo, only CONTRIBUTOR if pr.is_base_repo_private(): allowed_reverters.append("CONTRIBUTOR") + # Special case the pytorch-auto-revert app, which does not have an association + # but should be able to issue revert commands + if comment.author_url == "https://github.com/apps/pytorch-auto-revert": + allowed_reverters.append("NONE") + if author_association not in allowed_reverters: raise PostCommentError( f"Will not revert as @{author_login} is not one of " f"[{', '.join(allowed_reverters)}], but instead is {author_association}." ) - # Raises exception if matching rule is not found, but ignores all status checks - find_matching_merge_rule( - pr, repo, skip_mandatory_checks=True, skip_internal_checks=True - ) commit_sha = get_pr_commit_sha(repo, pr) return (author_login, commit_sha) diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index fee9ca2eac12..baff04967e3a 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -71,12 +71,15 @@ jobs: with:!{{ upload.binary_env_as_input(config) }} {%- if "aarch64" in build_environment %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" {%- elif "s390x" in build_environment %} runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 + {%- elif config["gpu_arch_type"] == "rocm" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -174,6 +177,9 @@ jobs: runs-on: linux.rocm.gpu.mi250 timeout-minutes: !{{ common.timeout_minutes }} !{{ upload.binary_env(config) }} + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 7f307447c357..ad5dd74972d0 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -26,9 +26,8 @@ name: !{{ build_environment }} - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}" + python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}" freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} {%- endmacro %} diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b2..476dd182db0f 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -187,8 +187,6 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') &&
'580.65.06' || '570.133.07' }} if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index aba3fa3dceec..ebf96264e994 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,12 +67,12 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. So much memory just to generate cpp # doc - runner: ${{ inputs.runner_prefix }}linux.12xlarge + runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved timeout-minutes: 360 - docs_type: python - runner: ${{ inputs.runner_prefix }}linux.2xlarge + runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge # It takes less than 30m to finish python docs unless there are issues timeout-minutes: 30 # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180) diff --git a/.github/workflows/_get-changed-files.yml b/.github/workflows/_get-changed-files.yml index 55712b065270..599d7a3277fe 100644 --- a/.github/workflows/_get-changed-files.yml +++ b/.github/workflows/_get-changed-files.yml @@ -2,6 +2,12 @@ name: Get Changed Files on: workflow_call: + inputs: + all_files: + description: "Whether to return all files instead of just changed files" + required: false + type: boolean + default: false outputs: changed-files: description: "List of changed files (space-separated) or '*' if not in a PR" @@ -26,16 +32,31 @@ jobs: # Get the PR number from the github context PR_NUMBER="${{ github.event.number }}" - # Use gh CLI to get changed files in the PR with explicit repo - CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') + # Check if all_files is requested + if [ "${{ inputs.all_files }}" = "true" ]; then + echo "all_files input is true, returning all files" + echo "changed-files=*" >> "$GITHUB_OUTPUT" + else + # Use gh CLI to get changed files in the PR with explicit repo + CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//') - if [ -z "$CHANGED_FILES" ]; then - echo "No changed files found, setting to '*'" - CHANGED_FILES="*" - fi + # See https://github.com/pytorch/pytorch/pull/134215#issuecomment-2332128790 + PYI_FILES_TO_ADD="" + for file in ${CHANGED_FILES}; do + if [[ "${file}" == *".pyi.in" ]]; then + PYI_FILES_TO_ADD="${PYI_FILES_TO_ADD} ${file//.in/}" + fi + done + CHANGED_FILES="${CHANGED_FILES}${PYI_FILES_TO_ADD}" + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found, setting to '*'" + CHANGED_FILES="*" + fi - echo "Changed files: $CHANGED_FILES" - echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" + echo "Changed files: $CHANGED_FILES" + echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT" + fi else echo "Not in PR context, setting changed files to '*'" diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 6b4bd429e3c9..cc0064391fde 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -37,7 +37,7 @@ on: runner: required: false type: string - default: "linux.2xlarge" + default: "linux.c7i.2xlarge" 
description: | Label of the runner this job should run on. test-matrix: diff --git a/.github/workflows/_linux-test-stable-fa3.yml b/.github/workflows/_linux-test-stable-fa3.yml new file mode 100644 index 000000000000..63a9e7359ed2 --- /dev/null +++ b/.github/workflows/_linux-test-stable-fa3.yml @@ -0,0 +1,255 @@ +# The point of this workflow is to test that a FA3 wheel that was built based off the +# stable ABI as of torch nightly 20250830 can still run on the newer torch. +# +# This workflow is very similar to the _linux-test.yml workflow, with the following +# differences: +# 1. It is simpler (there is no test matrix) +# 2. It pulls flash-attention as a secondary repository in order to access the tests. +# Note that it does not BUILD anything from flash-attention, as we have a prebuilt +# wheel. We pull flash-attention only to run a few tests. +# 3. It runs only FA3 tests. No PyTorch tests are run. +name: linux-test-stable-fa3 + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image: + required: true + type: string + description: Docker image to run in. + timeout-minutes: + required: false + type: number + default: 30 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + VLLM_TEST_HUGGING_FACE_TOKEN: + required: false + description: | + HF Auth token to test vllm + SCRIBE_GRAPHQL_ACCESS_TOKEN: + required: false + description: | + FB app token to write to scribe endpoint + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: linux.aws.h100 + timeout-minutes: ${{ inputs.timeout-minutes || 30 }} + permissions: + id-token: write + contents: read + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + no-sudo: true + + - name: Checkout flash-attention as a secondary repository + uses: actions/checkout@v4 + with: + repository: Dao-AILab/flash-attention + path: flash-attention + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*:} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Check if in a container runner + shell: bash + id: check_container_runner + run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" + + - name: Setup GPU_FLAG for docker run + id: setup-gpu-flag + run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" + + - name: Setup SCCACHE_SERVER_PORT environment for docker run when on 
container + id: setup-sscache-port-flag + run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }} + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + + - name: Set Test step time + id: test-timeout + shell: bash + env: + JOB_TIMEOUT: ${{ inputs.timeout-minutes }} + run: | + echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + + - name: Preserve github env variables for use in docker + shell: bash + run: | + env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}" + env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" + + - name: Test + id: test + timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + SHM_SIZE: '2g' + DOCKER_IMAGE: ${{ inputs.docker-image }} + VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ steps.get-job-id.outputs.job-id }} + run: | + set -x + + TEST_COMMAND=.ci/pytorch/test_fa3_abi_stable.sh + + # Leaving 1GB for the runner and other things + TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo) + # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap + # comes from https://github.com/pytorch/test-infra/pull/6058 + TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3)) + + + SHM_OPTS="--shm-size=${SHM_SIZE}" + JENKINS_USER="--user jenkins" + DOCKER_SHELL_CMD= + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG, SHM_OPTS, JENKINS_USER and DOCKER_SHELL_CMD since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e JOB_NAME \ + -e BASE_SHA \ + -e BRANCH \ + -e SHA1 \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e HUGGING_FACE_HUB_TOKEN \ + -e VLLM_TEST_HUGGING_FACE_TOKEN \ + -e SCRIBE_GRAPHQL_ACCESS_TOKEN \ + -e ARTIFACTS_FILE_SUFFIX \ + --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \ + 
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + ${SHM_OPTS} \ + --tty \ + --detach \ + --name="${container_name}" \ + ${JENKINS_USER} \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" \ + ${DOCKER_SHELL_CMD} + ) + + echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" + + docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" + + - name: Collect backtraces from coredumps (if any) + if: always() + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0 + if: failure() + with: + name: coredumps-fa3-stable-abi-smoke-tests + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Upload utilization stats + if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' }} + continue-on-error: true + uses: ./.github/actions/upload-utilization-stats + with: + job_id: ${{ steps.get-job-id.outputs.job-id }} + job_name: ${{ steps.get-job-id.outputs.job-name }} + workflow_name: ${{ github.workflow }} + workflow_run_id: ${{github.run_id}} + workflow_attempt: ${{github.run_attempt}} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 537e94488b36..29c2fc8e0847 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -224,6 +224,46 @@ jobs: continue-on-error: true uses: ./.github/actions/download-td-artifacts + - name: Download Windows torch wheel for cross-compilation + if: matrix.win_torch_wheel_artifact != '' + uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0 + with: + name: ${{ matrix.win_torch_wheel_artifact }} + path: win-torch-wheel + + - name: Extract Windows wheel and setup CUDA libraries + if: matrix.win_torch_wheel_artifact != '' + shell: bash + run: | + set -x + + # Find the wheel file + WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1) + if [ -z "$WHEEL_FILE" ]; then + echo "Error: No wheel file found in win-torch-wheel directory" + exit 1 + fi + echo "Found wheel file: $WHEEL_FILE" + + # Unzip the wheel file + unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted + echo "Extracted wheel contents" + + # Setup CUDA libraries (cuda.lib and cudart.lib) directory + mkdir -p win-torch-wheel-extracted/lib/x64 + if [ -f "win-torch-wheel/cuda.lib" ]; then + mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/" + fi + if [ -f "win-torch-wheel/cudart.lib" ]; then + mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/ + echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/" + fi + + # Verify CUDA libraries are present + echo "CUDA libraries:" + ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found" + - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -273,6 +313,8 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ 
matrix.num_shards }} + EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} + OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -387,8 +429,6 @@ jobs: "${DOCKER_IMAGE}" \ ${DOCKER_SHELL_CMD} ) - # Propagate download.pytorch.org IP to container - grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index a2a5f8dd9111..24fe510f0fb5 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -85,7 +85,7 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} - pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + pip-requirements-file: .ci/docker/requirements-ci.txt - name: Install sccache (only for non-forked PRs, and pushes to trunk) uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 086e25b4868e..82eb3c4bf2c7 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -122,7 +122,7 @@ jobs: uses: pytorch/test-infra/.github/actions/setup-python@main with: python-version: ${{ inputs.python-version }} - pip-requirements-file: .github/requirements/pip-requirements-macOS.txt + pip-requirements-file: .ci/docker/requirements-ci.txt - name: Start monitoring script id: monitor-script diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 7781e1f65fd1..43ed76a63cc6 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -102,19 +102,6 @@ jobs: exit 1 fi - - name: configure aws credentials - id: aws_creds - uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - - name: Login to Amazon ECR - id: login-ecr - continue-on-error: true - uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 - - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 7067d79eb075..0fd3cf7f3972 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -84,9 +84,6 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main with: @@ -151,7 +148,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 @@ -171,6 +168,31 @@ jobs: run: | .ci/pytorch/win-build.sh + # Collect Windows torch libs and CUDA libs for cross-compilation 
+ - name: Collect Windows CUDA libs for cross-compilation + if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu' + shell: bash + run: | + set -ex + + # Create the directory structure if it does not exist + mkdir -p /c/${{ github.run_id }}/build-results + + # Copy CUDA libs + CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}" + + if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/ + fi + + if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then + cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/ + fi + + # List collected files + echo "Collected CUDA libs:" + ls -lah /c/${{ github.run_id }}/build-results/*.lib + # Upload to github so that people can click and download artifacts - name: Upload artifacts to s3 if: steps.build.outcome != 'skipped' diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 5049ef61f693..3d2fe8a4b3fa 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -77,9 +77,6 @@ jobs: # in https://github.com/actions/checkout/issues/1018 git config --global core.fsmonitor false - - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main - - name: Setup SSH (Click me for login details) uses: pytorch/test-infra/.github/actions/setup-ssh@main with: @@ -106,18 +103,6 @@ jobs: with: cuda-version: ${{ inputs.cuda-version }} - # TODO: Move to a requirements.txt file for windows - - name: Install pip dependencies - uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0 - with: - shell: bash - timeout_minutes: 5 - max_attempts: 5 - retry_wait_seconds: 30 - command: | - set -eu - python3 -m pip install 'xdoctest>=1.1.0' - - name: Get workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -184,7 +169,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} @@ -272,15 +257,6 @@ jobs: shell: bash run: python3 .github/scripts/parse_ref.py - - name: Uninstall PyTorch - if: always() - continue-on-error: true - shell: bash - run: | - # This step removes PyTorch installed by the test to give a clean slate - # to the next job - python3 -mpip uninstall -y torch - - name: Teardown Windows uses: ./.github/actions/teardown-win if: always() diff --git a/.github/workflows/b200-distributed.yml b/.github/workflows/b200-distributed.yml new file mode 100644 index 000000000000..596a31431e61 --- /dev/null +++ b/.github/workflows/b200-distributed.yml @@ -0,0 +1,62 @@ +name: CI for distributed tests on B200 + +on: + pull_request: + paths: + - .github/workflows/b200-distributed.yml + workflow_dispatch: + push: + tags: + - ciflow/b200-distributed/* + schedule: + - cron: 46 8 * * * # about 1:46am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses:
pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200: + name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200: + name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200 + with: + timeout-minutes: 1200 + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/b200-symm-mem.yml b/.github/workflows/b200-symm-mem.yml new file mode 100644 index 000000000000..7fa8a8a73044 --- /dev/null +++ b/.github/workflows/b200-symm-mem.yml @@ -0,0 +1,60 @@ +name: Limited CI for symmetric memory tests on B200 + +on: + pull_request: + paths: + - .github/workflows/b200-symm-mem.yml + workflow_dispatch: + push: + tags: + - ciflow/b200-symm-mem/* + schedule: + - cron: 22 8 * * * # about 1:22am PDT + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + uses: ./.github/workflows/_linux-test.yml + needs: + - 
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 0754b154a358..8318286cccbe 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -36,7 +36,7 @@ jobs: runs-on: linux.9xlarge.ephemeral strategy: matrix: - tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] + tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"] steps: - name: Build docker image uses: pytorch/pytorch/.github/actions/binary-docker-build@main diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index cc2f54fc45f8..c67281e0a112 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -52,8 +52,8 @@ jobs: { tag: "cuda12.9" }, { tag: "cuda12.8" }, { tag: "cuda12.6" }, - { tag: "rocm6.3" }, { tag: "rocm6.4" }, + { tag: "rocm7.0" }, { tag: "cpu" }, ] steps: diff --git a/.github/workflows/build-magma-rocm-linux.yml b/.github/workflows/build-magma-rocm-linux.yml index b6eb09188fd4..eaeb741e5639 100644 --- a/.github/workflows/build-magma-rocm-linux.yml +++ b/.github/workflows/build-magma-rocm-linux.yml @@ -34,7 +34,7 @@ jobs: id-token: write strategy: matrix: - rocm_version: ["64", "63"] + rocm_version: ["70", "64"] steps: - name: Checkout PyTorch uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 9d08501f51bc..a5c5c387adb8 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -46,17 +46,18 @@ jobs: fail-fast: false matrix: include: [ - { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, + { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", 
runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, ] runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c886302..9e4144ae56c2 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -50,12 +50,12 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["cuda", "rocm", "xpu", "aarch64"] docker-image: ["pytorch/manylinux2_28-builder:cpu"] include: - device: "rocm" - rocm_version: "6.4" + rocm_version: "7.0" runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" - device: "cuda" rocm_version: "" @@ -108,9 +108,6 @@ jobs: # Determine python executable for given version case $PY_VERS in - 3.9) - PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python - ;; 3.10) PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python ;; @@ -194,7 +191,7 @@ jobs: strategy: fail-fast: false matrix: - py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] + py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ] device: ["xpu"] timeout-minutes: 40 env: diff --git a/.github/workflows/build-vllm-wheel.yml b/.github/workflows/build-vllm-wheel.yml index 2c6635374841..4526faf6d7fc 100644 --- a/.github/workflows/build-vllm-wheel.yml +++ b/.github/workflows/build-vllm-wheel.yml @@ -27,9 +27,8 @@ jobs: fail-fast: false matrix: python-version: [ '3.12' ] - # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] - device: [ 'cu128', 'cu129' ] + device: [ 'cu128', 'cu129', 'cu130' ] include: - platform: manylinux_2_28_x86_64 device: cu128 @@ -39,6 +38,10 @@ jobs: device: cu129 manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' runner: linux.12xlarge.memory + - platform: manylinux_2_28_x86_64 + device: cu130 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0' + runner: linux.12xlarge.memory - platform: manylinux_2_28_aarch64 device: cu128 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' @@ -47,6 +50,11 @@ jobs: device: cu129 manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' runner: linux.arm64.r7g.12xlarge.memory + exclude: + # TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and + # xformers is updated to support 13.0 + - platform: manylinux_2_28_aarch64 + device: cu130 name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" runs-on: ${{ matrix.runner }} timeout-minutes: 480 @@ -169,7 +177,12 @@ jobs: fail-fast: false matrix: platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] - device: [ 'cu128', 'cu129' ] + device: [ 'cu128', 'cu129', 'cu130' ] + exclude: + # TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and + # xformers is updated to support 13.0 + - platform: manylinux_2_28_aarch64 + device: cu130 env: PLATFORM: ${{ matrix.platform }} BUILD_DEVICE: ${{ matrix.device }} diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 57fe7be15d29..d5e0d96fe19f 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -35,6 +35,7 @@ jobs: contents: write outputs: pt_release_name: ${{ steps.release_name.outputs.pt_release_name }} +
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: @@ -53,8 +54,12 @@ jobs: tag_or_branch="${tag_or_branch#refs/heads/}" # replace directory separators with _ in branch name tag_or_branch="${tag_or_branch//\//_}" - echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" - echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" + torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')" + { + echo "PT_RELEASE_NAME=pytorch-$tag_or_branch"; + echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz"; + echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz"; + } >> "$GITHUB_ENV" - name: Checkout optional submodules run: python3 tools/optional_submodules.py - name: Copy docs requirements for inclusion @@ -64,30 +69,47 @@ jobs: cp .ci/docker/requirements-docs.txt docs/requirements.txt - name: Create source distribution run: | - # Create new folder with specified name so extracting the archive yields that - rm -rf "/tmp/$PT_RELEASE_NAME" - cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" - mv "/tmp/$PT_RELEASE_NAME" . - # Cleanup - rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} - find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true - # Create archive - tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" - echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" + # Create new folder with specified name so extracting the archive yields that + rm -rf "/tmp/$PT_RELEASE_NAME" + cp -r "$PWD" "/tmp/$PT_RELEASE_NAME" + mv "/tmp/$PT_RELEASE_NAME" . + # Cleanup + rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci} + find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true + # Create archive + tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME" + echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")" + - name: Create PEP 517 compatible source distribution + run: | + pip install build==1.2.2.post1 || exit 1 + python -m build --sdist || exit 1 + cd dist || exit 1 - name: Upload source distribution for release if: ${{ github.event_name == 'release' }} uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2 with: - files: ${{env.PT_RELEASE_FILE}} - - name: Upload source distribution to GHA artifacts for release tags + files: | + ${{ env.PT_RELEASE_FILE }} + ${{ env.PT_PEP517_RELEASE_FILE }} + - name: Upload source distribution to GHA artifacts # for release tags if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ env.PT_RELEASE_FILE }} path: ${{ env.PT_RELEASE_FILE }} + - name: Upload PEP 517 source distribution to GHA artifacts # for release tags + if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: ${{ env.PT_PEP517_RELEASE_FILE }} + path: dist/${{ env.PT_PEP517_RELEASE_FILE }} - name: Set output id: release_name - run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}" + run: | + { + echo "pt_release_name=${{ env.PT_RELEASE_FILE }}"; + echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}"; + } >> "${GITHUB_OUTPUT}" upload_source_code_to_s3: if: ${{ github.repository == 'pytorch/pytorch' && 
github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }} @@ -103,6 +125,9 @@ jobs: - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 with: name: ${{ needs.release.outputs.pt_release_name }} + - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: ${{ needs.release.outputs.pt_pep517_release_name }} - name: Configure AWS credentials(PyTorch account) uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 with: @@ -113,7 +138,9 @@ jobs: s3-bucket: pytorch s3-prefix: source_code/test if-no-files-found: warn - path: ${{ needs.release.outputs.pt_release_name }} + path: | + ${{ needs.release.outputs.pt_release_name }} + ${{ needs.release.outputs.pt_pep517_release_name }} concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 492f41775d9d..ca257ee8225a 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -59,7 +59,6 @@ jobs: pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, - pytorch-linux-noble-rocm-alpha-py3, pytorch-linux-jammy-rocm-n-py3-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, pytorch-linux-jammy-py3.10-gcc11, @@ -70,9 +69,8 @@ jobs: pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, - # Executorch pin needs update - # pytorch-linux-jammy-py3-clang12-executorch, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, + pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, pytorch-linux-noble-riscv64-py3.12-gcc14 ] diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 8a3c0840f843..fd31e4819bb9 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -62,7 +62,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -128,7 +128,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -174,7 +174,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -204,6 +204,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: 
./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -220,7 +266,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -265,7 +311,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -331,7 +377,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: 
manywheel-py3_11-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -377,7 +423,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -407,6 +453,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -423,7 +515,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: 
linux-aarch64-binary-manywheel @@ -468,7 +560,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -534,7 +626,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -580,7 +672,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -610,6 +702,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + 
DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -626,7 +764,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -671,7 +809,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -737,7 +875,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -783,7 +921,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -813,6 +951,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; 
platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -829,7 +1013,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -874,7 +1058,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -940,7 +1124,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -986,7 +1170,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1016,6 +1200,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | 
nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1032,7 +1262,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_13t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -1077,7 +1307,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1143,7 +1373,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -1189,7 +1419,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1219,6 +1449,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of 
GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1235,7 +1511,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel @@ -1280,7 +1556,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel @@ -1346,7 +1622,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel @@ -1392,7 +1668,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel @@ -1422,6 +1698,52 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda-aarch64-12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.r7g.12xlarge.memory + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_9-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1438,7 +1760,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.arm64.m7g.4xlarge.ephemeral + runs_on: linux.arm64.r7g.12xlarge.memory ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel diff --git a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 
03835a9f5f35..7f3277ef64a1 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -248,6 +248,74 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_9-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - libtorch-cuda12_9-shared-with-deps-release-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + build_environment: linux-binary-libtorch + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_9-shared-with-deps-release-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: libtorch-cxx11-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + build_name: libtorch-cuda12_9-shared-with-deps-release + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -316,7 +384,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm6_3-shared-with-deps-release-build: + libtorch-rocm6_4-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -325,22 +393,23 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: 
rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-rocm6_3-shared-with-deps-release + timeout-minutes: 300 + build_name: libtorch-rocm6_4-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm6_3-shared-with-deps-release-test: # Testing + libtorch-rocm6_4-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-rocm6_3-shared-with-deps-release-build + - libtorch-rocm6_4-shared-with-deps-release-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -349,21 +418,24 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-rocm6_3-shared-with-deps-release + name: libtorch-rocm6_4-shared-with-deps-release path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -394,7 +466,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -407,30 +479,30 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading + libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-rocm6_3-shared-with-deps-release-test + needs: libtorch-rocm6_4-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-rocm6_3-shared-with-deps-release + build_name: libtorch-rocm6_4-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm6_4-shared-with-deps-release-build: + libtorch-rocm7_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -439,22 +511,23 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" 
GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-rocm6_4-shared-with-deps-release + timeout-minutes: 300 + build_name: libtorch-rocm7_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm6_4-shared-with-deps-release-test: # Testing + libtorch-rocm7_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-rocm6_4-shared-with-deps-release-build + - libtorch-rocm7_0-shared-with-deps-release-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -463,21 +536,24 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-rocm6_4-shared-with-deps-release + name: libtorch-rocm7_0-shared-with-deps-release path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -508,7 +584,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -521,25 +597,25 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading + libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-rocm6_4-shared-with-deps-release-test + needs: libtorch-rocm7_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-rocm6_4-shared-with-deps-release + build_name: libtorch-rocm7_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml deleted file mode 100644 index c98d71dfefc4..000000000000 --- a/.github/workflows/generated-linux-binary-libtorch-release-main.yml +++ /dev/null @@ -1,87 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: 
.github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-libtorch-release - - -on: - push: - branches: - - main - tags: - - 'ciflow/trunk/*' - workflow_dispatch: - -permissions: - id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-libtorch-release - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cpu-shared-with-deps-release - build_environment: linux-binary-libtorch-release - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cpu-shared-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-release-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cpu - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cpu-shared-with-deps-release - build_environment: linux-binary-libtorch-release - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml deleted file mode 100644 index 96b9f9f739f7..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ /dev/null @@ -1,88 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel - - -on: - push: - branches: - - main - tags: - - 'ciflow/trunk/*' - workflow_dispatch: - -permissions: - 
id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_12-cuda12_8-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_8 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_8-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_8-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_8 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: 
${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 0f87f97df694..a4a1e3cea95c 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -241,6 +241,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_10-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: 
./.github/workflows/_binary-upload.yml + manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -307,7 +373,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm6_3-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -316,21 +382,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm6_3-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-rocm6_3-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -339,20 +406,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm6_3 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -383,7 +453,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -396,29 +466,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm6_3-upload: # Uploading + manywheel-py3_10-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-rocm6_3-test + needs: manywheel-py3_10-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm6_3 + build_name: manywheel-py3_10-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - 
manywheel-py3_10-rocm6_4-build: + manywheel-py3_10-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -427,21 +497,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm6_4-test: # Testing + manywheel-py3_10-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_10-rocm6_4-build + - manywheel-py3_10-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -450,20 +521,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm6_4 + name: manywheel-py3_10-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -494,7 +568,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -507,24 +581,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm6_4-upload: # Uploading + manywheel-py3_10-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-rocm6_4-test + needs: manywheel-py3_10-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm6_4 + build_name: manywheel-py3_10-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -833,6 +907,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch 
+ PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_11-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -899,7 +1039,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm6_3-build: + manywheel-py3_11-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -908,21 +1048,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - 
GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_11-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm6_3-test: # Testing + manywheel-py3_11-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-rocm6_3-build + - manywheel-py3_11-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -931,20 +1072,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm6_3 + name: manywheel-py3_11-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -975,7 +1119,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -988,29 +1132,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm6_3-upload: # Uploading + manywheel-py3_11-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-rocm6_3-test + needs: manywheel-py3_11-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm6_3 + build_name: manywheel-py3_11-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm6_4-build: + manywheel-py3_11-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1019,21 +1163,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: 
manywheel-py3_11-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_11-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm6_4-test: # Testing + manywheel-py3_11-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_11-rocm6_4-build + - manywheel-py3_11-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1042,20 +1187,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm6_4 + name: manywheel-py3_11-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1086,7 +1234,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1099,24 +1247,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm6_4-upload: # Uploading + manywheel-py3_11-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-rocm6_4-test + needs: manywheel-py3_11-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm6_4 + build_name: manywheel-py3_11-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -1425,6 +1573,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; 
platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_12-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1491,7 +1705,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm6_3-build: + manywheel-py3_12-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1500,21 +1714,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_12-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm6_3-test: # Testing + manywheel-py3_12-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - 
manywheel-py3_12-rocm6_3-build + - manywheel-py3_12-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1523,20 +1738,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm6_3 + name: manywheel-py3_12-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1567,7 +1785,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1580,29 +1798,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_3-upload: # Uploading + manywheel-py3_12-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_3-test + needs: manywheel-py3_12-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_3 + build_name: manywheel-py3_12-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm6_4-build: + manywheel-py3_12-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -1611,21 +1829,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_12-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm6_4-test: # Testing + manywheel-py3_12-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_12-rocm6_4-build + - manywheel-py3_12-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -1634,20 +1853,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - 
DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm6_4 + name: manywheel-py3_12-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -1678,7 +1900,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -1691,24 +1913,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm6_4-upload: # Uploading + manywheel-py3_12-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-rocm6_4-test + needs: manywheel-py3_12-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm6_4 + build_name: manywheel-py3_12-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2017,6 +2239,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2083,7 +2371,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-rocm6_3-build: + manywheel-py3_13-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2092,21 +2380,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-rocm6_3-test: # Testing + manywheel-py3_13-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-rocm6_3-build + - manywheel-py3_13-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2115,20 +2404,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: 
./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13-rocm6_3 + name: manywheel-py3_13-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2159,7 +2451,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2172,29 +2464,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13-rocm6_3-upload: # Uploading + manywheel-py3_13-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-rocm6_3-test + needs: manywheel-py3_13-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-rocm6_3 + build_name: manywheel-py3_13-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-rocm6_4-build: + manywheel-py3_13-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2203,21 +2495,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_13-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-rocm6_4-test: # Testing + manywheel-py3_13-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13-rocm6_4-build + - manywheel-py3_13-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2226,20 +2519,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13-rocm6_4 + name: manywheel-py3_13-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2270,7 +2566,7 @@ jobs: with: 
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2283,24 +2579,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13-rocm6_4-upload: # Uploading + manywheel-py3_13-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-rocm6_4-test + needs: manywheel-py3_13-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-rocm6_4 + build_name: manywheel-py3_13-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2515,35 +2811,101 @@ jobs: DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_6 + build_name: manywheel-py3_13t-cuda12_6 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda12_6-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_13t-cuda12_8 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 
'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda12_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cuda12_8-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.4xlarge.nvidia.gpu # 12.6 build can use maxwell (sm_50) runner + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_6-upload: # Uploading + manywheel-py3_13t-cuda12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_6-test + needs: manywheel-py3_13t-cuda12_8-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu126 - GPU_ARCH_VERSION: "12.6" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_6 + build_name: manywheel-py3_13t-cuda12_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_8-build: + manywheel-py3_13t-cuda12_9-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2552,22 +2914,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | 
nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_8-test: # Testing + manywheel-py3_13t-cuda12_9-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-cuda12_8-build + - manywheel-py3_13t-cuda12_9-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -2575,36 +2937,36 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 build_environment: linux-binary-manywheel runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_8-upload: # Uploading + manywheel-py3_13t-cuda12_9-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda12_8-test + needs: manywheel-py3_13t-cuda12_9-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu128 - GPU_ARCH_VERSION: "12.8" + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_8 + build_name: manywheel-py3_13t-cuda12_9 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2675,7 +3037,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-rocm6_3-build: + manywheel-py3_13t-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' 
}} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2684,21 +3046,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_13t-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-rocm6_3-test: # Testing + manywheel-py3_13t-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-rocm6_3-build + - manywheel-py3_13t-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2707,20 +3070,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13t-rocm6_3 + name: manywheel-py3_13t-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2751,7 +3117,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2764,29 +3130,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13t-rocm6_3-upload: # Uploading + manywheel-py3_13t-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-rocm6_3-test + needs: manywheel-py3_13t-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-rocm6_3 + build_name: manywheel-py3_13t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-rocm6_4-build: + manywheel-py3_13t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -2795,21 +3161,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: 
"6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_13t-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-rocm6_4-test: # Testing + manywheel-py3_13t-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_13t-rocm6_4-build + - manywheel-py3_13t-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -2818,20 +3185,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_13t-rocm6_4 + name: manywheel-py3_13t-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -2862,7 +3232,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -2875,24 +3245,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_13t-rocm6_4-upload: # Uploading + manywheel-py3_13t-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-rocm6_4-test + needs: manywheel-py3_13t-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-rocm6_4 + build_name: manywheel-py3_13t-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3201,6 +3571,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3267,7 +3703,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-rocm6_3-build: + manywheel-py3_14-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3276,21 +3712,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-rocm6_3 
+ timeout-minutes: 300 + build_name: manywheel-py3_14-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-rocm6_3-test: # Testing + manywheel-py3_14-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14-rocm6_3-build + - manywheel-py3_14-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3299,20 +3736,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14-rocm6_3 + name: manywheel-py3_14-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3343,7 +3783,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3356,29 +3796,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14-rocm6_3-upload: # Uploading + manywheel-py3_14-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-rocm6_3-test + needs: manywheel-py3_14-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-rocm6_3 + build_name: manywheel-py3_14-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-rocm6_4-build: + manywheel-py3_14-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3387,21 +3827,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_14-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-rocm6_4-test: # Testing + manywheel-py3_14-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - 
- manywheel-py3_14-rocm6_4-build + - manywheel-py3_14-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3410,20 +3851,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14-rocm6_4 + name: manywheel-py3_14-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3454,7 +3898,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3467,24 +3911,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14-rocm6_4-upload: # Uploading + manywheel-py3_14-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-rocm6_4-test + needs: manywheel-py3_14-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-rocm6_4 + build_name: manywheel-py3_14-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3793,6 +4237,72 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda12_9-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build_name: manywheel-py3_14t-cuda12_9 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda12_9-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cuda12_9-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda12_9 + build_environment: linux-binary-manywheel + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda12_9-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda12_9-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu129 + GPU_ARCH_VERSION: "12.9" + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: manylinux2_28-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda12_9 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3859,7 +4369,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-rocm6_3-build: + manywheel-py3_14t-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3868,21 +4378,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-rocm6_3 + timeout-minutes: 300 + build_name: manywheel-py3_14t-rocm6_4 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-rocm6_3-test: # Testing + manywheel-py3_14t-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-rocm6_3-build + - manywheel-py3_14t-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -3891,20 +4402,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: 
"6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" + permissions: + id-token: write + contents: read steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14t-rocm6_3 + name: manywheel-py3_14t-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -3935,7 +4449,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.3 + custom-tag-prefix: rocm6.4 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -3948,29 +4462,29 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14t-rocm6_3-upload: # Uploading + manywheel-py3_14t-rocm6_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-rocm6_3-test + needs: manywheel-py3_14t-rocm6_4-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.3 - GPU_ARCH_VERSION: "6.3" + DESIRED_CUDA: rocm6.4 + GPU_ARCH_VERSION: "6.4" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.3 + DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-rocm6_3 + build_name: manywheel-py3_14t-rocm6_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-rocm6_4-build: + manywheel-py3_14t-rocm7_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -3979,21 +4493,22 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_14t-rocm7_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-rocm6_4-test: # Testing + manywheel-py3_14t-rocm7_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_14t-rocm6_4-build + - manywheel-py3_14t-rocm7_0-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -4002,20 +4517,23 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" + permissions: + id-token: write + contents: read 
steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_14t-rocm6_4 + name: manywheel-py3_14t-rocm7_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 @@ -4046,7 +4564,7 @@ jobs: with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 + custom-tag-prefix: rocm7.0 docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image @@ -4059,24 +4577,24 @@ jobs: DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_14t-rocm6_4-upload: # Uploading + manywheel-py3_14t-rocm7_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-rocm6_4-test + needs: manywheel-py3_14t-rocm7_0-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" + DESIRED_CUDA: rocm7.0 + GPU_ARCH_VERSION: "7.0" GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 + DOCKER_IMAGE_TAG_PREFIX: rocm7.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-rocm6_4 + build_name: manywheel-py3_14t-rocm7_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml deleted file mode 100644 index 8177bac3fe21..000000000000 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ /dev/null @@ -1,135 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/linux_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: linux-binary-manywheel-rocm - - -on: - push: - branches: - - main - tags: - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - - 'ciflow/rocm/*' - workflow_dispatch: - -permissions: - id-token: write - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BINARY_ENV_FILE: /tmp/env - BUILD_ENVIRONMENT: linux-binary-manywheel-rocm - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PYTORCH_FINAL_PACKAGE_DIR: /artifacts - PYTORCH_ROOT: /pytorch - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 0 -concurrency: - group: linux-binary-manywheel-rocm-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: 
./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" - GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 - build_environment: linux-binary-manywheel-rocm - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_9-rocm6_4-build - - get-label-type - runs-on: linux.rocm.gpu.mi250 - timeout-minutes: 240 - env: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm6.4 - GPU_ARCH_VERSION: "6.4" - GPU_ARCH_TYPE: rocm - SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" - steps: - - name: Setup ROCm - uses: ./.github/actions/setup-rocm - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: manywheel-py3_9-rocm6_4 - path: "${{ runner.temp }}/artifacts/" - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: configure aws credentials - id: aws_creds - if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }} - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only - aws-region: us-east-1 - role-duration-seconds: 18000 - - name: Calculate docker image - id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main - with: - docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} - docker-image-name: manylinux2_28-builder - custom-tag-prefix: rocm6.4 - docker-build-dir: .ci/docker - working-directory: pytorch - - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main - with: - docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Test Pytorch binary - uses: ./pytorch/.github/actions/test-pytorch-binary - env: - DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} - - name: Teardown ROCm - uses: ./.github/actions/teardown-rocm diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index cd912650eb17..109e98cd9d91 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -63,7 +63,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 
version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.10.4" freethreaded: false diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index 8522d2d36993..afe9330deb83 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -59,7 +59,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.10.4" freethreaded: false @@ -169,7 +168,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.11.4" freethreaded: false @@ -279,7 +277,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.12.4" freethreaded: false @@ -389,7 +386,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.13.4" freethreaded: false @@ -499,7 +495,6 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 python-version: "3.13.4" freethreaded: true @@ -609,9 +604,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "3.14.0-rc.2" + python-version: "3.14.0" freethreaded: false - name: Checkout PyTorch uses: actions/checkout@v4 @@ -719,9 +713,8 @@ jobs: - name: Setup Python uses: actions/setup-python@v6 with: - # TODO: Removeme once 3.14 is out # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 - python-version: "3.14.0-rc.2" + python-version: "3.14.0" freethreaded: true - name: Checkout PyTorch uses: actions/checkout@v4 diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml deleted file mode 100644 index 818d2ca45cc4..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ /dev/null @@ -1,261 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-debug - -on: - push: - branches: - - main - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: windows-binary-libtorch-debug - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 - OS: windows -concurrency: - group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name 
== 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-debug-build: - if: ${{ github.repository_owner == 'pytorch' }} - needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v4.4.0 - if: always() - with: - name: libtorch-cpu-shared-with-deps-debug - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - libtorch-cpu-shared-with-deps-debug-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-debug-build - - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: debug - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks 
true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-debug - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml deleted file mode 100644 index ff8a2bbbfe1e..000000000000 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ /dev/null @@ -1,261 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/windows_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: windows-binary-libtorch-release - -on: - push: - branches: - - main - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: 
windows-binary-libtorch-release - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 - OS: windows -concurrency: - group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - get-label-type: - if: github.repository_owner == 'pytorch' - name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main - with: - triggering_actor: ${{ github.triggering_actor }} - issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} - curr_branch: ${{ github.head_ref || github.ref_name }} - curr_ref_type: ${{ github.ref_type }} - libtorch-cpu-shared-with-deps-release-build: - if: ${{ github.repository_owner == 'pytorch' }} - needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. 
The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" - - uses: actions/upload-artifact@v4.4.0 - if: always() - with: - name: libtorch-cpu-shared-with-deps-release - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - libtorch-cpu-shared-with-deps-release-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - libtorch-cpu-shared-with-deps-release-build - - get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 360 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_CONFIG: release - LIBTORCH_VARIANT: shared-with-deps - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.10" - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}" - 
} - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - echo "system info $(uname -a)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main - continue-on-error: true - with: - github-secret: ${{ secrets.GITHUB_TOKEN }} - - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon - shell: bash - run: | - git config --global core.longpaths true - git config --global core.symlinks true - - # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock - # the directory on Windows and prevent GHA from checking out as reported - # in https://github.com/actions/checkout/issues/1018 - git config --global core.fsmonitor false - # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 - - name: Enable long paths on Windows - shell: powershell - run: | - Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 - # Since it's just a defensive command, the workflow should continue even the command fails. This step can be - # removed once Windows Defender is removed from the AMI - - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch - continue-on-error: true - shell: powershell - run: | - Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore - # Let's both exclude the path and disable Windows Defender completely just to be sure - # that it doesn't interfere - Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore - - name: Checkout PyTorch - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - show-progress: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - uses: actions/download-artifact@v4.1.7 - name: Download Build Artifacts - with: - name: libtorch-cpu-shared-with-deps-release - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Populate binary env - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Test PyTorch binary - shell: bash - run: | - "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" - - name: Wait until all sessions have drained - shell: powershell - working-directory: pytorch - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - working-directory: pytorch - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index a0a7495483d4..be19b8f961f4 100644 --- a/.github/workflows/h100-distributed.yml +++ b/.github/workflows/h100-distributed.yml @@ -37,7 +37,7 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.12xlarge" + runner: "linux.c7i.12xlarge" build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 cuda-arch-list: '9.0' diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index 41210f89c9a8..8209bf053a77 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -2,7 +2,7 @@ name: inductor-perf-nightly-h100 on: schedule: - - cron: 15 0,12 * * 1-6 + - cron: 15 0 * * 1-6 - cron: 0 7 * * 0 # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs @@ -130,7 +130,7 @@ jobs: name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build - if: github.event.schedule == '15 0,12 * * 1-6' + if: github.event.schedule == '15 0 * * 1-6' with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index c3b9a4229924..81c1c27b7643 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -63,6 +63,7 @@ jobs: # Same as the build job python-version: 3.12.7 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }} + timeout-minutes: 300 disable-monitor: false monitor-log-interval: 15 monitor-data-collect-interval: 4 diff --git a/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml new file mode 100644 index 000000000000..8d6da1850300 --- /dev/null +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi300.yml @@ -0,0 +1,132 @@ +name: inductor-perf-nightly-rocm-mi300 + +on: + push: + tags: + - ciflow/inductor-perf-test-nightly-rocm-mi300/* + schedule: + - cron: 
15 0 * * *
+  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
+  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: true
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  linux-jammy-rocm-py3_10-inductor-benchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: rocm-py3_10-inductor-benchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner:
"linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-inductor-benchmark-test: + permissions: + id-token: write + contents: read + name: rocm-py3_10-inductor-benchmark-test + uses: ./.github/workflows/_rocm-test.yml + needs: linux-jammy-rocm-py3_10-inductor-benchmark-build + with: + build-environment: linux-jammy-rocm-py3_10 + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true + docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }} + timeout-minutes: 720 + # Disable monitor in perf tests for more investigation + disable-monitor: true + monitor-log-interval: 10 + monitor-data-collect-interval: 2 + secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml similarity index 58% rename from .github/workflows/inductor-perf-test-nightly-rocm.yml rename to .github/workflows/inductor-perf-test-nightly-rocm-mi355.yml index f329fe74e6b6..24872d2b1f11 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm-mi355.yml @@ -1,11 +1,11 @@ -name: inductor-perf-nightly-rocm +name: inductor-perf-nightly-rocm-mi355 on: push: tags: - - ciflow/inductor-perf-test-nightly-rocm/* + - ciflow/inductor-perf-test-nightly-rocm-mi355/* schedule: - - cron: 0 7 * * 0,3 + - cron: 15 0 * * * # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs workflow_dispatch: @@ -59,7 +59,7 @@ on: description: The list of configs used the benchmark required: false type: string - default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm + default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ 
github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} @@ -88,23 +88,27 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks test-matrix: | { include: [ - { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: 
"linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, + { config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 454cd166c90b..4b0e573d129c 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -106,6 +106,16 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, ]} secrets: inherit diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb39071..729b11157485 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -12,6 +12,7 @@ on: - landchecks/* tags: - ciflow/pull/* + - ciflow/trunk/* workflow_dispatch: permissions: read-all @@ -31,9 +32,13 @@ jobs: if: github.repository_owner == 'pytorch' name: Get changed files uses: ./.github/workflows/_get-changed-files.yml + with: + all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') || github.event_name == 'push' }} lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + # Needed to prevent deduping on HUD + name: lintrunner-clang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to clangtidy / clangformat if: | @@ -53,7 +58,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: 
ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -73,6 +78,7 @@ jobs: # fails to find types when it should lintrunner-mypy: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to mypy if: | @@ -97,6 +103,7 @@ jobs: lintrunner-noclang: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + name: lintrunner-noclang-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} needs: [get-label-type, get-changed-files] with: timeout: 120 @@ -111,9 +118,9 @@ jobs: CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" echo "Running all other linters" if [ "$CHANGED_FILES" = '*' ]; then - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY --all-files" .github/scripts/lintrunner.sh else - ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh + ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh fi quick-checks: @@ -264,10 +271,10 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index dcdc2cd0ba24..40fb3b8d0c85 100644 --- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -7,9 +7,11 @@ on: workflow_dispatch: inputs: test_mode: - required: false - type: string - default: 'short' + type: choice + options: + - 'short' + - 'long' + - 'all' description: tag filter for operator benchmarks, options from long, short, all schedule: # Run at 07:00 UTC every Sunday @@ -28,38 +30,49 @@ permissions: contents: read jobs: - opbenchmark-build: + x86-opbenchmark-build: if: github.repository_owner == 'pytorch' - name: opbenchmark-build + name: x86-opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.10-gcc11-build docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ - { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + { config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - opbenchmark-on-demand-build: - if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: opbenchmark-on-demand-build - uses: ./.github/workflows/_linux-build.yml + x86-opbenchmark-test: + name: x86-opbenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: x86-opbenchmark-build with: build-environment: linux-jammy-py3.10-gcc11-build - docker-image-name: 
ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks + docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }} + secrets: inherit + + aarch64-opbenchmark-build: + if: github.repository_owner == 'pytorch' + name: aarch64-opbenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-aarch64-py3.10 + runner: linux.arm64.m7g.4xlarge + docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11 test-matrix: | { include: [ - { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, ]} secrets: inherit - opbenchmark-test: - name: opbenchmark-test + aarch64-opbenchmark-test: + name: aarch64-opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: opbenchmark-build + needs: aarch64-opbenchmark-build with: - build-environment: linux-jammy-py3.10-gcc11-build - docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} + build-environment: linux-jammy-aarch64-py3.10 + docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml new file mode 100644 index 000000000000..89d6d63c7287 --- /dev/null +++ b/.github/workflows/operator_microbenchmark.yml @@ -0,0 +1,100 @@ +name: operator_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 6 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + # H100 A100 runners + opmicrobenchmark-build: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + ]} + secrets: inherit + + opmicrobenchmark-test: + name: opmicrobenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }} + secrets: inherit + + # B200 runner + opmicrobenchmark-build-b200: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-b200 + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | 
+ { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + secrets: inherit + + opmicrobenchmark-test-b200: + name: opmicrobenchmark-test-b200 + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build-b200 + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100 + docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit + + # ROCM MI300 runner + opmicrobenchmark-build-rocm: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build-rocm + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3_10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" }, + ]} + secrets: inherit + + opmicrobenchmark-test-rocm: + name: opmicrobenchmark-test-rocm + uses: ./.github/workflows/_rocm-test.yml + needs: opmicrobenchmark-build-rocm + with: + timeout-minutes: 500 + build-environment: linux-jammy-rocm-py3_10 + docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 714838eb8476..d821878074b2 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -59,13 +59,14 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.4-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11 + cuda-arch-list: 7.5 test-matrix: | { include: [ - { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -112,13 +113,13 @@ jobs: test-matrix: ${{ 
needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_9-gcc9-build: - name: linux-jammy-cuda12.8-py3.9-gcc9 + linux-jammy-cuda12_8-py3_10-gcc9-build: + name: linux-jammy-cuda12.8-py3.10-gcc9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-py3.9-gcc9 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 cuda-arch-list: 8.6 test-matrix: | @@ -128,14 +129,14 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_9-gcc9-test: - name: linux-jammy-cuda12.8-py3.9-gcc9 + linux-jammy-cuda12_8-py3_10-gcc9-test: + name: linux-jammy-cuda12.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_9-gcc9-build + needs: linux-jammy-cuda12_8-py3_10-gcc9-build with: - build-environment: linux-jammy-cuda12.8-py3.9-gcc9 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }} + build-environment: linux-jammy-cuda12.8-py3.10-gcc9 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }} secrets: inherit linux-jammy-cuda12_8-py3_10-gcc9-debug-build: @@ -181,11 +182,11 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 test-matrix: | { include: [ - { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, - { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -212,9 +213,9 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 test-matrix: | { include: [ - { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, - { config: "distributed", shard: 3, num_shards: 3, runner: 
"linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, + { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] }, ]} secrets: inherit diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ff6e9ed10711..a31a10063f1b 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,7 +127,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan @@ -318,32 +317,6 @@ jobs: ]} secrets: inherit - linux-jammy-py3-clang12-executorch-build: - if: false # Docker build needs pin update - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3-clang12-executorch - docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch - test-matrix: | - { include: [ - { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3-clang12-executorch-test: - name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3-clang12-executorch-build - if: false # Has been broken for a while - with: - build-environment: linux-jammy-py3-clang12-executorch - docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} - secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: name: cuda12.8-py3.10-gcc9-sm75 uses: ./.github/workflows/_linux-build.yml @@ -369,14 +342,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-n-py3_9-build: - name: linux-jammy-xpu-n-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-n-py3.9 + build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ diff --git a/.github/workflows/quantization-periodic.yml b/.github/workflows/quantization-periodic.yml new file mode 100644 index 000000000000..688f557eaf0e --- /dev/null +++ b/.github/workflows/quantization-periodic.yml @@ -0,0 +1,54 @@ +name: quantization-periodic + +on: + push: + tags: + - ciflow/quantization-periodic/* + workflow_dispatch: + schedule: + # run weekly + - cron: "45 0 * * 0" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + get-default-label-prefix: + name: get-default-label-prefix 
+ uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + opt_out_experiments: lf + + periodic-quantization-build: + name: periodic-quantization-build + uses: ./.github/workflows/_linux-build.yml + needs: get-default-label-prefix + with: + runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.9' + test-matrix: | + { include: [ + { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" }, + ]} + secrets: inherit + periodic-test-quantization: + name: periodic-test-quantization + uses: ./.github/workflows/_linux-test.yml + needs: periodic-quantization-build + with: + build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11 + docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index e5dda604a4db..6d05ae9ae3ec 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -1,6 +1,9 @@ name: rocm-mi355 on: + push: + tags: + - ciflow/rocm-mi355/* workflow_dispatch: schedule: - cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT @@ -38,16 +41,16 @@ jobs: with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-noble-rocm-py3.12-mi355 - docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3 + docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, ]} secrets: inherit @@ -64,5 +67,7 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi355 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" + 
tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda' + || '' }} secrets: inherit diff --git a/.github/workflows/rocm-navi31.yml b/.github/workflows/rocm-navi31.yml new file mode 100644 index 000000000000..aaee8fce262b --- /dev/null +++ b/.github/workflows/rocm-navi31.yml @@ -0,0 +1,63 @@ +name: rocm-navi31 + +on: + push: + tags: + - ciflow/rocm-navi31/* + workflow_dispatch: + schedule: + # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs. + # Also run less frequently on weekends. + - cron: 45 */2 * * 1-5 + - cron: 45 4,12 * * 0,6 + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: read-all + +jobs: + target-determination: + if: github.repository_owner == 'pytorch' + name: before-test + uses: ./.github/workflows/target_determination.yml + permissions: + id-token: write + contents: read + + linux-jammy-rocm-py3_10-build: + if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} + name: linux-jammy-rocm-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 + sync-tag: rocm-build + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" }, + ]} + secrets: inherit + + linux-jammy-rocm-py3_10-test: + permissions: + id-token: write + contents: read + name: linux-jammy-rocm-py3_10 + uses: ./.github/workflows/_rocm-test.yml + needs: + - linux-jammy-rocm-py3_10-build + - target-determination + with: + build-environment: linux-jammy-rocm-py3.10 + docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} + tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs + test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark + inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor + inductor/test_torchinductor inductor/test_decompose_mem_bound_mm + inductor/test_flex_attention inductor/test_max_autotune' || '' }} + secrets: inherit diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index c21c851aab6d..227c7f676b1c 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -36,12 +36,12 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.2" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + 
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi250.1" }, ]} secrets: inherit diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 9675ee4169f4..d4992a2ddb2c 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -140,7 +140,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan diff --git a/.github/workflows/test-b200.yml b/.github/workflows/test-b200.yml new file mode 100644 index 000000000000..ef7f75bc4b2b --- /dev/null +++ b/.github/workflows/test-b200.yml @@ -0,0 +1,76 @@ +# B200 Smoke Tests CI Workflow +# +# This workflow runs smoke tests on B200 hardware +# +# Flow: +# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200 +# 2. Runs smoke tests on linux.dgx.b200 runner +# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function +# +# Triggered by: +# - Pull requests modifying this workflow file +# - Manual dispatch +# - Schedule (every 6 hours) +# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag) + +name: B200 Smoke Tests + +on: + pull_request: + paths: + - .github/workflows/test-b200.yml + workflow_dispatch: + schedule: + - cron: 0 4,10,16,22 * * * # every 6 hours + push: + tags: + - ciflow/b200/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + + get-label-type: + if: github.repository_owner == 'pytorch' + name: get-label-type + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + with: + triggering_actor: ${{ github.triggering_actor }} + issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} + curr_branch: ${{ github.head_ref || github.ref_name }} + curr_ref_type: ${{ github.ref_type }} + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-build: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.12xlarge.memory + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '10.0' + test-matrix: | + { include: [ + { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" }, + ]} + # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm100-test: + name: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }} + test-matrix: ${{ 
needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }} + aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 1e83c7b9d98c..ec99f4473bb0 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -61,3 +61,15 @@ jobs: docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test: + name: linux-jammy-cuda12_8-py3_10-gcc11-sm90-FA3-ABI-stable-test + uses: ./.github/workflows/_linux-test-stable-fa3.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build.outputs.docker-image }} + timeout-minutes: 30 + s3-bucket: gha-artifacts + secrets: inherit diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5b1a12812003..48d1c4490d72 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -56,7 +56,7 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 build-generates-artifacts: false runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runner: "linux.4xlarge" + runner: "linux.c7i.4xlarge" test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -160,9 +160,10 @@ jobs: runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, - { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, - { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, + { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" }, ]} secrets: inherit @@ -179,13 +180,13 @@ jobs: disable-monitor: false secrets: inherit - win-vs2022-cuda12_6-py3-build: - name: win-vs2022-cuda12.6-py3 + win-vs2022-cuda12_8-py3-build: + name: win-vs2022-cuda12.8-py3 uses: ./.github/workflows/_win-build.yml needs: get-label-type with: - build-environment: win-vs2022-cuda12.6-py3 - cuda-version: "12.6" + build-environment: win-vs2022-cuda12.8-py3 + cuda-version: "12.8" runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" secrets: inherit @@ -203,7 +204,6 @@ jobs: { include: [ { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, - { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" }, ]} secrets: inherit @@ -221,7 
+221,7 @@ jobs: build-environment: linux-jammy-rocm-py3.10 docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" + tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" secrets: inherit inductor-build: @@ -234,6 +234,23 @@ jobs: cuda-arch-list: '8.0' secrets: inherit + # Test cross-compiled models with Windows libs extracted from wheel + cross-compile-linux-test: + name: cross-compile-linux-test + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda12_8-py3_10-gcc11-build + - get-label-type + - win-vs2022-cuda12_8-py3-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: | + { include: [ + { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" }, + ]} + secrets: inherit + verify-cachebench-cpu-build: name: verify-cachebench-cpu-build uses: ./.github/workflows/_linux-build.yml @@ -259,3 +276,38 @@ jobs: docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit + + linux-jammy-py3-clang12-executorch-build: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-py3-clang12-executorch + docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch + test-matrix: | + { include: [ + { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, + ]} + secrets: inherit + + linux-jammy-py3-clang12-executorch-test: + name: linux-jammy-py3-clang12-executorch + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-py3-clang12-executorch-build + with: + build-environment: linux-jammy-py3-clang12-executorch + docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-py3_10-gcc11-full-debug-build-only: + name: linux-jammy-py3.10-gcc11-full-debug-build-only + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runner: linux.2xlarge.memory + build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 + secrets: inherit diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd0..b5955127d9fb 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -53,27 +53,3 @@ jobs: issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - - linux-jammy-py3_9-clang9-xla-build: - name: linux-jammy-py3_9-clang9-xla 
- uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang9-xla - docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite - test-matrix: | - { include: [ - { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" }, - ]} - secrets: inherit - - linux-jammy-py3_9-clang9-xla-test: - name: linux-jammy-py3_9-clang9-xla - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_9-clang9-xla-build - with: - build-environment: linux-jammy-py3.9-clang9-xla - docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }} - secrets: inherit diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index aa12cf22b246..b3fc9efdf667 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -23,7 +23,7 @@ jobs: with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} @@ -48,4 +48,7 @@ jobs: echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json" pip install awscli==1.29.40 aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json" + # Push new viable/strict tag + cd pytorch/pytorch + git push origin "${LATEST_SHA}:refs/tags/viable/strict/${TIME}" fi diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index b2768a8f767e..3bddecdadfe3 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -42,11 +42,11 @@ jobs: build-external-packages: "vllm" build-environment: linux-jammy-cuda12.8-py3.12-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm - cuda-arch-list: '8.0;8.9;9.0' + cuda-arch-list: '8.0 8.9 9.0' runner: linux.24xlarge.memory test-matrix: | { include: [ - { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, @@ -54,7 +54,7 @@ jobs: { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, - { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: 
"linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_language_model_test_extended_generation_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 36ba62349f28..c6bdb06812e7 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -35,7 +35,7 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-1-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 - runner: linux.12xlarge + runner: linux.c7i.12xlarge test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, @@ -56,7 +56,7 @@ jobs: runner_prefix: ${{ needs.get-label-type.outputs.label-type }} build-environment: linux-jammy-xpu-n-py3.10 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 - runner: linux.12xlarge + runner: linux.c7i.12xlarge test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, diff --git a/.gitignore b/.gitignore index 2dd40f8cfa85..447ef777e929 100644 --- a/.gitignore +++ b/.gitignore @@ -82,12 +82,13 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/functionalization/generated/* torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated torch/testing/_internal/generated/annotated_fn_args.py torch/testing/_internal/data/*.pt -torch/csrc/api/include/torch/version.h +torch/headeronly/version.h torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp @@ -259,6 +260,9 @@ gen .pytest_cache aten/build/* +# Linker scripts for prioritized text optimization +cmake/linker_script.ld + # Bram plsdontbreak @@ -370,6 +374,7 @@ third_party/ruy/ third_party/glog/ # Virtualenv +.venv/ venv/ # Log files @@ -391,3 +396,4 @@ android/pytorch_android_torchvision/.cxx CLAUDE.local.md /test_*.py /debug_*.py +CLAUDE_CONTEXT/ diff --git a/.lintrunner.toml b/.lintrunner.toml index 1f79f1eb971d..411e4d2c215b 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -18,6 +18,7 @@ exclude_patterns = [ 'torch/_inductor/autoheuristic/artifacts/**', 'scripts/**', 'test/generated_type_hints_smoketest.py', + 'test/test_torchfuzz_repros.py', # CPython tests 'test/dynamo/cpython/**', # Tests from the NumPy test suite @@ -27,6 +28,7 @@ exclude_patterns = [ 'torch/lib/**', 'venv/**', '**/*.pyi', + "tools/experimental/torchfuzz/**", 'tools/test/test_selective_build.py', ] command = [ @@ -49,7 +51,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.14.0', 'pyflakes==3.4.0', - 'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"', + 'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"', ] @@ -123,6 +125,7 @@ is_formatter = true code = 'MYPY' include_patterns = [ 'setup.py', + 'functorch/dim/**/*.py', 'torch/**/*.py', 'torch/**/*.pyi', 'caffe2/**/*.py', @@ -152,7 +155,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'numpy==1.26.4 ; python_version >= "3.9" 
and python_version <= "3.11"', + 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', 'numpy==2.1.0 ; python_version >= "3.12"', 'expecttest==0.3.0', 'mypy==1.16.0', @@ -195,6 +198,7 @@ exclude_patterns = [ 'tools/test/gen_operators_yaml_test.py', 'tools/test/gen_oplist_test.py', 'tools/test/test_selective_build.py', + 'tools/experimental/torchfuzz/**', ] command = [ 'python3', @@ -205,6 +209,46 @@ command = [ '@{{PATHSFILE}}' ] + +[[linter]] +code = 'PYREFLY' +include_patterns = [ + 'torch/**/*.py', + 'torch/**/*.pyi', + 'torchgen/**/*.py', + 'torchgen/**/*.pyi', + 'functorch/**/*.py', + 'functorch/**/*.pyi', +] +exclude_patterns = [] +command = [ + 'python3', + 'tools/linter/adapters/pyrefly_linter.py', + '--config=pyrefly.toml', +] +init_command = [ + 'python3', + 'tools/linter/adapters/pip_init.py', + '--dry-run={{DRYRUN}}', + 'numpy==2.1.0 ; python_version >= "3.12"', + 'expecttest==0.3.0', + 'pyrefly==0.36.2', + 'sympy==1.13.3', + 'types-requests==2.27.25', + 'types-pyyaml==6.0.2', + 'types-tabulate==0.8.8', + 'types-protobuf==5.29.1.20250403', + 'types-setuptools==79.0.0.20250422', + 'types-jinja2==2.11.9', + 'types-colorama==0.4.6', + 'filelock==3.18.0', + 'junitparser==2.1.1', + 'rich==14.1.0', + 'optree==0.17.0', + 'types-openpyxl==3.1.5.20250919', + 'types-python-dateutil==2.9.0.20251008' +] + [[linter]] code = 'CLANGTIDY' include_patterns = [ @@ -964,7 +1008,6 @@ exclude_patterns = [ 'test/jit/**', # should be run through test/test_jit.py 'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py 'test/fx/**', # should be run through test/test_fx.py - 'test/bottleneck_test/**', # excluded by test/run_test.py 'test/package/**', # excluded by test/run_test.py 'test/distributed/argparse_util_test.py', 'test/distributed/bin/test_script.py', @@ -1259,6 +1302,7 @@ exclude_patterns = [ 'test/test_masked.py', 'test/test_maskedtensor.py', 'test/test_matmul_cuda.py', + 'test/test_scaled_matmul_cuda.py', 'test/test_meta.py', 'test/test_metal.py', 'test/test_mkl_verbose.py', @@ -1410,8 +1454,6 @@ exclude_patterns = [ 'torch/utils/benchmark/utils/timer.py', 'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py', 'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py', - 'torch/utils/bottleneck/__init__.py', - 'torch/utils/bottleneck/__main__.py', 'torch/utils/bundled_inputs.py', 'torch/utils/checkpoint.py', 'torch/utils/collect_env.py', @@ -1454,7 +1496,7 @@ init_command = [ '--dry-run={{DRYRUN}}', 'usort==1.0.8.post1', 'isort==6.0.1', - 'ruff==0.12.9', # sync with RUFF + 'ruff==0.13.1', # sync with RUFF ] is_formatter = true @@ -1571,6 +1613,7 @@ exclude_patterns = [ 'torch/_inductor/fx_passes/serialized_patterns/**', 'torch/_inductor/autoheuristic/artifacts/**', 'test/dynamo/cpython/**', + 'test/test_torchfuzz_repros.py', 'scripts/**', 'third_party/**', 'fb/**', @@ -1588,7 +1631,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.12.9', # sync with PYFMT + 'ruff==0.13.1', # sync with PYFMT ] is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 635f39eed2ce..4737a2a0c486 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -13,6 +13,9 @@ load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libt load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources") load("//:tools/bazel.bzl", "rules") +# Export files for use by torch/headeronly (where version.h generation now lives) 
+exports_files(["version.txt"]) + define_targets(rules = rules) COMMON_COPTS = [ @@ -22,6 +25,7 @@ COMMON_COPTS = [ "-DHAVE_SHM_UNLINK=1", "-D_FILE_OFFSET_BITS=64", "-DUSE_FBGEMM", + "-DUSE_DISTRIBUTED", "-DAT_PER_OPERATOR_HEADERS", "-DATEN_THREADING=NATIVE", "-DNO_CUDNN_DESTROY_HANDLE", @@ -90,6 +94,8 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", "aten/src/ATen/VmapGeneratedPlumbing.h", + "aten/src/ATen/ViewMetaClasses.h", + "aten/src/ATen/ViewMetaClasses.cpp", "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -687,7 +693,9 @@ cc_library( "torch/csrc/*/generated/*.h", "torch/csrc/jit/serialization/mobile_bytecode_generated.h", ] + torch_cuda_headers, - ) + GENERATED_AUTOGRAD_CPP + [":version_h"], + ) + GENERATED_AUTOGRAD_CPP + [ + "//torch/headeronly:version_h", + ], includes = [ "third_party/kineto/libkineto/include", "torch/csrc", @@ -810,7 +818,7 @@ cc_library( name = "torch_python", srcs = libtorch_python_core_sources + if_cuda(libtorch_python_cuda_sources) - + libtorch_python_distributed_sources + + if_cuda(libtorch_python_distributed_sources) + GENERATED_AUTOGRAD_PYTHON, hdrs = glob([ "torch/csrc/generic/*.cpp", @@ -832,36 +840,6 @@ pybind_extension( ], ) -cc_library( - name = "functorch", - hdrs = glob([ - "functorch/csrc/dim/*.h", - ]), - srcs = glob([ - "functorch/csrc/dim/*.cpp", - ]), - deps = [ - ":aten_nvrtc", - ":torch_python", - "@pybind11", - ], -) - -pybind_extension( - name = "functorch/_C", - copts=[ - "-DTORCH_EXTENSION_NAME=_C" - ], - srcs = [ - "functorch/csrc/init_dim_only.cpp", - ], - deps = [ - ":functorch", - ":torch_python", - ":aten_nvrtc", - ], -) - cc_binary( name = "torch/bin/torch_shm_manager", srcs = [ @@ -902,7 +880,6 @@ py_library( ], data = [ ":torch/_C.so", - ":functorch/_C.so", ":torch/bin/torch_shm_manager", ], ) @@ -1105,6 +1082,7 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", + "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fba0eea881b..0b88247df27a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,4 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) -# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW) # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this # sometimes makes XCode C compiler gets detected as "Clang", even when the C++ @@ -181,9 +180,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)") set(CPU_POWER ON) endif() -# For non-supported platforms, turn USE_DISTRIBUTED off by default. -# NB: USE_DISTRIBUTED simply disables the backend; distributed code -# still gets built +# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not +# tested and likely won't work without additional changes. 
if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF @@ -263,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF) -option(USE_DISTRIBUTED "Enable default distributed backends" ON) +option(USE_DISTRIBUTED "Use distributed" ON) cmake_dependent_option(USE_NCCL "Use NCCL" ON "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_XCCL "Use XCCL" ON - "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF) + "USE_XPU;UNIX;NOT APPLE" OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF) cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF) cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) @@ -380,12 +378,19 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler" OFF "USE_CUDA" OFF) cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON "CPU_AARCH64" OFF) +# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le. +set(USE_PRIORITIZED_TEXT_DEFAULT OFF) +if(LINUX AND CPU_AARCH64) + set(USE_PRIORITIZED_TEXT_DEFAULT ON) +endif() +cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld." + "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance -# on Windows. +# on Windows and AArch64. option(USE_MIMALLOC_ON_MKL "Use mimalloc on MKL" OFF) -if(WIN32) +if(WIN32 OR (CPU_AARCH64 AND NOT APPLE)) set(USE_MIMALLOC ON) # Not enable USE_MIMALLOC_ON_MKL due to it caused issue: @@ -432,11 +437,12 @@ if(WIN32) PATH_SUFFIXES lib NO_DEFAULT_PATH) if(NOT libuv_tmp_LIBRARY) + set(USE_DISTRIBUTED OFF) set(USE_GLOO OFF) message( WARNING - "Libuv is not installed in current conda env. Set USE_GLOO to OFF. " - "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv." + "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. " + "Please run command 'conda install -c conda-forge libuv=1.51' to install libuv." ) else() set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../) @@ -657,6 +663,11 @@ endif(MSVC) string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all") +# Set linker max-page-size to 64KiB on AArch64 Linux +if(LINUX AND CPU_AARCH64) + add_link_options_if_supported("-z,max-page-size=0x10000") +endif() + # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not # applicable to mobile are disabled by this variable. 
Setting # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it @@ -877,23 +888,28 @@ cmake_dependent_option( "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) + +IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build") + set(USE_FBGEMM_GENAI_DEFAULT ON) +elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) + message(STATUS "Setting USE_FBGEMM_GENAI to ON by default , doing CUDA build for SM100a") + set(USE_FBGEMM_GENAI_DEFAULT ON) +else() + set(USE_FBGEMM_GENAI_DEFAULT OFF) +endif() + cmake_dependent_option( USE_FBGEMM_GENAI "Whether to build FBGEMM GenAI quantized GEMM kernels.\ Will be disabled if not supported by the platform" - ON - "USE_ROCM" + ${USE_FBGEMM_GENAI_DEFAULT} + "(USE_CUDA AND NOT MSVC) OR USE_ROCM" OFF) -IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) - message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") - set(USE_FBGEMM_GENAI off) -endif() # Set USE_FBGEMM_GENAI to ON for CUDA build on SM100. if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32) - message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a") - set(USE_FBGEMM_GENAI ON) endif() # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem @@ -1379,10 +1395,6 @@ endif() include(cmake/Summary.cmake) caffe2_print_configuration_summary() -if(BUILD_FUNCTORCH) - add_subdirectory(functorch) -endif() - # Parse custom debug info if(DEFINED USE_CUSTOM_DEBINFO) string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}") @@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA) install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") endif() + +if(USE_PRIORITIZED_TEXT_FOR_LD) + add_compile_options( + $<$:-ffunction-sections> + $<$:-fdata-sections> + ) + set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld") + set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt") + + add_custom_command( + OUTPUT "${LINKER_SCRIPT_FILE_OUT}" + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}" + DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}" + COMMENT "Generating prioritized text linker files" + VERBATIM + ) + + add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + + if(BUILD_PYTHON) + set(LINKER_OPT_TARGETS torch_python) + endif() + + if(NOT BUILD_LIBTORCHLESS) + list(APPEND LINKER_OPT_TARGETS torch_cpu c10) + if(USE_CUDA) + list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda) + endif() + if(USE_XPU) + list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu) + endif() + if(USE_ROCM) + list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip) + endif() + endif() + + foreach(tgt IN LISTS LINKER_OPT_TARGETS) + if(TARGET ${tgt}) + add_dependencies("${tgt}" generate_linker_script) + target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}") + set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}") + else() + message(WARNING "Requested target '${tgt}' for linker script optimization was not found.") + endif() + endforeach() + +else() + if(LINUX AND CPU_AARCH64) + message(WARNING [[ + It is 
strongly recommend to enable linker script optimization for all AArch64 Linux builds. + To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1 + ]]) + endif() +endif() diff --git a/CODEOWNERS b/CODEOWNERS index 1d91adacb062..cc249dc4f43a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -181,15 +181,15 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd /torch/csrc/jit/python/init.cpp @mikaylagawarecki # CUDA and CUDA math libraries -aten/src/ATen/cuda/ @eqy @syed-ahmed -aten/src/ATen/cudnn/ @eqy @syed-ahmed -aten/src/ATen/native/cuda/ @eqy @syed-ahmed -aten/src/ATen/native/cudnn/ @eqy @syed-ahmed -c10/cuda @eqy @syed-ahmed -torch/cuda/ @eqy @syed-ahmed -torch/csrc/cuda/ @eqy @syed-ahmed -torch/backends/cuda/ @eqy @syed-ahmed -torch/backends/cudnn/ @eqy @syed-ahmed +aten/src/ATen/cuda/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/cudnn/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/native/cuda/ @eqy @syed-ahmed @Aidyn-A +aten/src/ATen/native/cudnn/ @eqy @syed-ahmed @Aidyn-A +c10/cuda @eqy @syed-ahmed @Aidyn-A +torch/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/csrc/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/backends/cuda/ @eqy @syed-ahmed @Aidyn-A +torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A # PyTree utilities /torch/utils/_pytree.py @XuehaiPan @@ -201,3 +201,17 @@ torch/backends/cudnn/ @eqy @syed-ahmed /torch/csrc/stable/ @janeyx99 @mikaylagawarecki /torch/headeronly/ @janeyx99 /torch/header_only_apis.txt @janeyx99 + +# FlexAttention +/torch/nn/attention/flex_attention.py @drisspg +/torch/_higher_order_ops/flex_attention.py @drisspg +/torch/_inductor/kernel/flex/ @drisspg +/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg +/test/inductor/test_flex_attention.py @drisspg +/test/inductor/test_flex_decoding.py @drisspg + +# Low Precision GEMMs +/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58 +/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58 +/test/test_scaled_matmul_cuda.py @drisspg @slayton58 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9d2b5d355391..4c46077f9db7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,7 +81,7 @@ git remote add upstream git@github.com:pytorch/pytorch.git make setup-env # Or run `make setup-env-cuda` for pre-built CUDA binaries # Or run `make setup-env-rocm` for pre-built ROCm binaries -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` ### Tips and Debugging @@ -182,28 +182,36 @@ You can use this script to check out a new nightly branch with the following: ```bash ./tools/nightly.py checkout -b my-nightly-branch -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --cuda -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` To install the nightly binaries built with ROCm, you can pass in the flag `--rocm`: ```bash ./tools/nightly.py checkout -b my-nightly-branch --rocm -source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +source venv/bin/activate # or `. 
.\venv\Scripts\activate` on Windows ``` You can also use this tool to pull the nightly commits into the current branch: ```bash -./tools/nightly.py pull -p my-env -source my-env/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows +./tools/nightly.py pull +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows +``` + +To create the virtual environment with a specific Python interpreter, you can +pass in the `--python` argument: + +```bash +./tools/nightly.py --python /path/to/python3.12 +source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows ``` Pulling will recreate a fresh virtual environment and reinstall the development diff --git a/Dockerfile b/Dockerfile index 7b8964bd860e..331cf00593cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,11 +50,10 @@ RUN git submodule update --init --recursive FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 -ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx -RUN /opt/conda/bin/conda update -y -n base -c defaults conda -RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 ARG TARGETPLATFORM diff --git a/MANIFEST.in b/MANIFEST.in index ec00f251160b..bb8e488283a9 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,20 +1,61 @@ # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html -# Include source files in SDist -include CMakeLists.txt -include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE -include BUCK BUCK.* -include requirements*.txt -include version.txt -include [Mm]akefile *.[Mm]akefile [Mm]akefile.* -include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore +# Include individual top-level files +include CITATION.cff +include CODEOWNERS +include Dockerfile +include LICENSE +include MANIFEST.in +include Makefile +include NOTICE +include .bc-linter.yml +include .clang-format .clang-tidy +include .cmakelintrc +include .coveragerc +include .dockerignore +include .editorconfig +include .flake8 +include .gdbinit +include .lintrunner.toml +include .lldbinit +include codex_setup.sh +include docker.Makefile +include pyrefly.toml +include ubsan.supp + +# Include bazel and BUCK related files +include BUILD.bazel BUCK.oss +include WORKSPACE +include *.bzl +include .bazelignore .bazelrc .bazelversion + +# Include general configuration files +include *.ini +# Include important top-level information +include *.md +# Include technical text files at the moment, comprises +# version.txt, CMakeLists.txt, requirements.txt +include *.txt + +# Include ctags configuration +include .ctags.d/*.ctags + +# Include subfolders completely +graft .devcontainer +graft .vscode graft android graft aten +graft benchmarks graft binaries graft c10 graft caffe2 graft cmake +graft docs graft functorch +graft ios +graft mypy_plugins +graft scripts +graft test graft third_party graft tools graft torch @@ -22,29 +63,37 @@ graft torchgen # FIXME: torch-xla build during codegen will fail if include this file in wheel exclude torchgen/BUILD.bazel -# Misc files and directories in SDist -include *.md -include CITATION.cff -include LICENSE NOTICE -include mypy*.ini -graft benchmarks -graft docs -graft mypy_plugins -graft scripts +# The following exclusions omit parts from third-party dependencies that +# contain invalid symlinks[1] and that are not needed for pytorch, such as +# bindings for unused 
languages +prune third_party/flatbuffers/java +prune third_party/flatbuffers/kotlin +prune third_party/ittapi/rust +prune third_party/nccl/pkg/debian +prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-* + +# The following document is also an invalid symlink[1] and superfluous +exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md + +# Omit autogenerated code +prune torchgen/packaged + +# Omit caches, compiled, and scm related content +prune */__pycache__ +prune **/.github +prune **/.gitlab +global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib +global-exclude *.py[cod] *.swp *~ +global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules +global-exclude .gitlab-ci.yml # Misc files needed for custom setuptools command include .gitignore include .gitmodules -# Include test suites in SDist -graft test -include pytest.ini -include .coveragerc - -# Prune generated/compiled files -prune torchgen/packaged -prune */__pycache__ -global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod] +# [1] Invalid symlinks for the purposes of Python source distributions are, +# according to the source distribution format[2] links pointing outside the +# destination directory or links with a `..` component, which is those of +# concern here. -prune */.git -global-exclude .git *~ *.swp +# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features diff --git a/README.md b/README.md index 99e6dabd1618..61b4447ddf4d 100644 --- a/README.md +++ b/README.md @@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: -- Python 3.9 or later +- Python 3.10 or later - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux) - Visual Studio or Visual Studio Build Tool (Windows only) @@ -275,7 +275,7 @@ conda install pkg-config libuv pip install mkl-static mkl-include # Add these packages if torch.distributed is needed. # Distributed package support on Windows is a prototype feature and is subject to changes. 
-conda install -c conda-forge libuv +conda install -c conda-forge libuv=1.51 ``` #### Install PyTorch diff --git a/RELEASE.md b/RELEASE.md index 52371e73f0a6..87f042d659fd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,6 +3,7 @@ - [Release Compatibility Matrix](#release-compatibility-matrix) + - [PyTorch CUDA Support Matrix](#pytorch-cuda-support-matrix) - [Release Cadence](#release-cadence) - [General Overview](#general-overview) - [Frequently Asked Questions](#frequently-asked-questions) @@ -63,6 +64,22 @@ Following is the Release Compatibility Matrix for PyTorch releases: | 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 | | 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 | +### PyTorch CUDA Support Matrix + +For Release 2.9 PyTorch Supports following CUDA Architectures: + +| CUDA | architectures supported for Linux x86 and Windows builds | notes | +| --- | --- | --- | +| 12.6.3 | Maxwell(5.0), Pascal(6.0), Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0) | | +| 12.8.1 | Volta(7.0), Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0) | | +| 13.0.0 | Turing(7.5), Ampere(8.0, 8.6), Hopper(9.0), Blackwell(10.0, 12.0+PTX) | +PTX available on linux builds only | + +| CUDA | architectures supported for Linux aarch64 builds | +| --- | --- | +| 12.6.3 | Ampere(8.0), Hopper(9.0) | +| 12.8.1 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 12.0) | +| 13.0.0 | Ampere(8.0), Hopper(9.0), Blackwell(10.0, 11.0, 12.0+PTX) | + ## Release Cadence Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional. diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h index 307793301441..03b00cc21564 100644 --- a/aten/src/ATen/BlasBackend.h +++ b/aten/src/ATen/BlasBackend.h @@ -28,4 +28,19 @@ inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { return stream << BlasBackendToString(backend); } +namespace blas { + +enum class ScalingType : std::uint8_t { + TensorWise, // fp32 scales + RowWise, // fp32 scales + BlockWise1x16, // fp8_e4m3fn scales + BlockWise1x32, // fp8_e8m0fnu scales + BlockWise1x128, // fp32 scales + BlockWise128x128, // fp32 scales +}; + +enum class SwizzleType : std::uint8_t { NO_SWIZZLE = 0, SWIZZLE_32_4_4 = 1 }; + +} // namespace blas + } // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index aa250c8b7fae..6bf0797b9e46 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -256,6 +256,7 @@ endif() IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
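Note on the new scaled-GEMM enums introduced in aten/src/ATen/BlasBackend.h above: the hunk only declares `at::blas::ScalingType` and `at::blas::SwizzleType`, so the snippet below is a minimal, hypothetical C++ sketch (not part of this patch) of how a caller might dispatch on `ScalingType`; the scale-dtype annotations simply restate the comments from the enum declaration, and `SwizzleType` would be handled analogously.

// Hypothetical illustration only; assumes just the declarations added in the hunk above.
#include <ATen/BlasBackend.h> // declares at::blas::ScalingType / at::blas::SwizzleType
#include <cstdio>

// Map each scaling mode to a printable description.
// The scale-dtype notes mirror the comments in the enum declaration.
static const char* scaling_type_name(at::blas::ScalingType t) {
  switch (t) {
    case at::blas::ScalingType::TensorWise:       return "tensorwise (fp32 scales)";
    case at::blas::ScalingType::RowWise:          return "rowwise (fp32 scales)";
    case at::blas::ScalingType::BlockWise1x16:    return "blockwise 1x16 (fp8_e4m3fn scales)";
    case at::blas::ScalingType::BlockWise1x32:    return "blockwise 1x32 (fp8_e8m0fnu scales)";
    case at::blas::ScalingType::BlockWise1x128:   return "blockwise 1x128 (fp32 scales)";
    case at::blas::ScalingType::BlockWise128x128: return "blockwise 128x128 (fp32 scales)";
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", scaling_type_name(at::blas::ScalingType::RowWise));
  return 0;
}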
@@ -292,48 +293,65 @@ IF(USE_FBGEMM_GENAI) "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" ) - target_include_directories(fbgemm_genai PUBLIC + target_include_directories(fbgemm_genai PRIVATE ${FBGEMM_THIRD_PARTY}/cutlass/include ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include ${fbgemm_genai_mx8mx8bf16_grouped} ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) - else() - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) - - hip_add_library( - fbgemm_genai STATIC - ${fbgemm_genai_native_rocm_hip} - HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) - set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) - - target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_THIRD_PARTY}/cutlass/include - ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include - ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp - ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h - ) + + # Add FBGEMM_GENAI include directories for torch_ops.h + list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + elseif(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0") + list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1) + endif() + + # Only compile for gfx942 for now. 
+ # This is rather hacky, I could not figure out a clean solution :( + set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS}) + string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}") + if("gfx942" IN_LIST PYTORCH_ROCM_ARCH) + list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;) endif() + set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS}) + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PRIVATE + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + + # Add FBGEMM_GENAI include directories for torch_ops.h + list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) endif() endif() @@ -595,6 +613,11 @@ if(UNIX) if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DHAVE_MALLOC_USABLE_SIZE=1) endif(HAVE_MALLOC_USABLE_SIZE) + set(CMAKE_EXTRA_INCLUDE_FILES "fcntl.h") + CHECK_FUNCTION_EXISTS(posix_fallocate HAVE_POSIX_FALLOCATE) + if(HAVE_POSIX_FALLOCATE) + add_definitions(-DHAVE_POSIX_FALLOCATE=1) + endif(HAVE_POSIX_FALLOCATE) endif(UNIX) ADD_DEFINITIONS(-DUSE_EXTERNAL_MZCRC) @@ -677,12 +700,6 @@ if(USE_CUDA AND NOT USE_ROCM) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) - # Add FBGEMM_GENAI include directories for torch_ops.h - if(USE_FBGEMM_GENAI) - list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) - list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) - endif() - if($ENV{ATEN_STATIC_CUDA}) if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) list(APPEND ATen_CUDA_DEPENDENCY_LIBS diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 39932b1c4398..8b283556c7a4 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -144,8 +144,7 @@ inline std::string _all_equal_numel_error(at::ArrayRef tensors) { inline bool _apply_preamble(ArrayRef tensors) { checkDeviceType("CPU_tensor_apply", tensors, kCPU); checkLayout("CPU_tensor_apply", tensors, kStrided); - if (!_all_equal_numel(tensors)) - TORCH_CHECK(false, _all_equal_numel_error(tensors)); + TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors)); // An empty tensor has no elements for (auto& t : tensors) if (t.numel() == 0) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 4d48084b0ab8..3310abfb41d5 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ 
-40,41 +40,6 @@ namespace { ->conv ->rnn */ -const std::map> _fp32_precisions = { - {"generic", {{"ieee", "tf32", "bf16", "none"}}}, - {"mkldnn", {{"ieee", "tf32", "bf16", "none"}}}, - {"cuda", {{"ieee", "tf32", "none"}}}}; - -// Check whether the backend and op are legal -void check_fp32_prec_backend_and_op( - const std::string& backend, - const std::string& op) { - static std::vector backends = {"generic", "mkldnn", "cuda"}; - static std::vector operators = {"conv", "matmul", "rnn", "all"}; - TORCH_CHECK( - std::find(backends.begin(), backends.end(), backend) != backends.end(), - "Invalid backend: ", - backend); - TORCH_CHECK( - std::find(operators.begin(), operators.end(), op) != operators.end(), - "Invalid operator: ", - op); - if (backend == "generic") { - TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op); - } - } - - // Return whether the precision is supported by backends - bool validate_fp32_prec( - const std::string& backend, - const std::string& precision) { - auto iterp = _fp32_precisions.find(backend); - TORCH_CHECK(iterp != _fp32_precisions.end()); - auto precisions = iterp->second; - bool valid = std::find(precisions.begin(), precisions.end(), precision) != - precisions.end(); - return valid; - } C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){ TORCH_WARN_ONCE( @@ -86,6 +51,54 @@ void check_fp32_prec_backend_and_op( } } // namespace +Float32Backend str2backend(const std::string& name) { + if (name == "generic") + return Float32Backend::GENERIC; + else if (name == "cuda") + return Float32Backend::CUDA; + else if (name == "mkldnn") + return Float32Backend::MKLDNN; + TORCH_CHECK(false, "Unknown backend: ", name); +} + +Float32Op str2op(const std::string& name) { + if (name == "all") + return Float32Op::ALL; + else if (name == "conv") + return Float32Op::CONV; + else if (name == "rnn") + return Float32Op::RNN; + else if (name == "matmul") + return Float32Op::MATMUL; + TORCH_CHECK(false, "Unknown op: ", name); +} + +Float32Precision str2precision(const std::string& name) { + if (name == "none") + return Float32Precision::NONE; + else if (name == "ieee") + return Float32Precision::IEEE; + else if (name == "tf32") + return Float32Precision::TF32; + else if (name == "bf16") + return Float32Precision::BF16; + TORCH_CHECK(false, "Unknown precision: ", name); +} + +std::string precision2str(Float32Precision prec) { + switch (prec) { + case Float32Precision::NONE: + return "none"; + case Float32Precision::IEEE: + return "ieee"; + case Float32Precision::TF32: + return "tf32"; + case Float32Precision::BF16: + return "bf16"; + } + TORCH_CHECK(false, "Invalid enum Float32Precision(", static_cast(prec), ")"); +} + Context::Context() = default; // TODO: This could be bad juju if someone calls globalContext() in the @@ -179,10 +192,10 @@ void Context::setUserEnabledNNPACK(bool e) { enabled_nnpack = e; } -bool Context::allowTF32CuDNN(const std::string& op) const { - if (op.size() == 0){ - bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32"; - bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32"; +bool Context::allowTF32CuDNN(std::optional op) const { + if (!op.has_value()) { + bool allow_tf32_rnn = float32Precision(Float32Backend::CUDA, Float32Op::RNN) == Float32Precision::TF32; + bool allow_tf32_conv = float32Precision(Float32Backend::CUDA, Float32Op::CONV) == Float32Precision::TF32; TORCH_CHECK( allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn, "PyTorch is checking whether allow_tf32 is enabled for cuDNN without 
a specific operator name,", @@ -191,15 +204,15 @@ bool Context::allowTF32CuDNN(const std::string& op) const { "We suggest only using the new API to set the TF32 flag(s). See also: ", "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"); } else { - return float32Precision("cuda", op) == "tf32"; + return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32; } warn_deprecated_fp32_precision_api(); return allow_tf32_cudnn; } void Context::setAllowTF32CuDNN(bool b) { - setFloat32Precision("cuda", "rnn", b ? "tf32" : "none"); - setFloat32Precision("cuda", "conv", b ? "tf32" : "none"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE); + setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE); allow_tf32_cudnn = b; warn_deprecated_fp32_precision_api(); } @@ -279,45 +292,6 @@ bool Context::userEnabledOverrideableSDP() const { return enabled_overrideable; } -static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG"; -static constexpr const std::array cublas_deterministic_configs = {":4096:8", ":16:8"}; -#ifdef USE_ROCM -static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32"; -#endif - -bool Context::checkCuBLASConfigDeterministic() { - // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config - // is set to deterministic setting - if (hasCUDART()) { - const auto workspace_config = c10::utils::get_env(cublas_config_var_name); - return (workspace_config == cublas_deterministic_configs[0] || workspace_config == cublas_deterministic_configs[1]); - } - return true; -} - -void Context::alertCuBLASConfigNotDeterministic() const { - static const bool cublas_config_deterministic = checkCuBLASConfigDeterministic(); - if (C10_LIKELY(!deterministicAlgorithms() || cublas_config_deterministic)) { - return; - } - - auto msg = c10::str( - "Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or ", - "`at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because ", - "it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this ", - "case, you must set an environment variable before running your PyTorch application: ", - cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", - cublas_config_var_name, "=", cublas_deterministic_configs[1], ". 
For more information, go to ", - "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" - ); - - if (deterministicAlgorithmsWarnOnly()) { - TORCH_WARN(msg); - } else { - TORCH_CHECK(false, msg); - } -} - bool Context::benchmarkCuDNN() const { return benchmark_cudnn; } @@ -343,14 +317,8 @@ void Context::setImmediateMiopen(bool b) { } bool Context::allowTF32CuBLAS() const { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - return false; - } -#endif bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST; - bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32"; + bool allow_tf32_new = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32; TORCH_CHECK( legacy_allow_tf32 == allow_tf32_new, "PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,", @@ -362,26 +330,18 @@ bool Context::allowTF32CuBLAS() const { } void Context::setAllowTF32CuBLAS(bool b) { -#ifdef USE_ROCM - const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32); - if (allow_tf32 != true) { - C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. " - << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it."; - return; - } -#endif float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST; - setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, b ? Float32Precision::TF32 : Float32Precision::IEEE); } Float32MatmulPrecision Context::float32MatmulPrecision() const { - bool invalid = float32Precision("cuda", "matmul") == "tf32" && + bool invalid = float32Precision(Float32Backend::CUDA, Float32Op::MATMUL) == Float32Precision::TF32 && float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST; invalid = invalid || - (float32Precision("mkldnn", "matmul") == "bf16" && + (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::BF16 && float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM); invalid = invalid || - (float32Precision("mkldnn", "matmul") == "tf32" && + (float32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL) == Float32Precision::TF32 && float32_matmul_precision != at::Float32MatmulPrecision::HIGH); TORCH_CHECK( !invalid, @@ -393,15 +353,26 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const { return float32_matmul_precision; } -std::string Context::float32Precision(const std::string& backend, const std::string& op) const { - check_fp32_prec_backend_and_op(backend, op); - auto precision = fp32_precision.find(backend)->second.find(op)->second; - if (precision == "none") - precision = fp32_precision.find(backend)->second.find("all")->second; - if (precision == "none") - precision = fp32_precision.find("generic")->second.find("all")->second; - bool valid_prec = validate_fp32_prec(backend, precision); - return valid_prec ? 
precision : "none"; +Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op) const { + std::pair key{backend, op}; + auto it = fp32_precision.find(key); + TORCH_CHECK(it != fp32_precision.end(), "Invalid (backend, op) pair: (", backend, ", ", op, ")"); + + Float32Precision precision = it->second; + if (precision == Float32Precision::NONE) { + key.second = Float32Op::ALL; + precision = fp32_precision.find(key)->second; + } + if (precision == Float32Precision::NONE) { + key.first = Float32Backend::GENERIC; + precision = fp32_precision.find(key)->second; + } + + // "cuda" does not support "bf16" + if (backend == Float32Backend::CUDA && precision == Float32Precision::BF16) { + return Float32Precision::NONE; + } + return precision; } void Context::setFloat32MatmulPrecision(const std::string &s) { @@ -410,18 +381,18 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention if (s_ == "highest") { float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST; - setFloat32Precision("cuda", "matmul", "ieee"); - setFloat32Precision("mkldnn", "matmul", "ieee"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::IEEE); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::IEEE); return true; } else if (s_ == "high") { float32_matmul_precision = at::Float32MatmulPrecision::HIGH; - setFloat32Precision("cuda", "matmul", "tf32"); - setFloat32Precision("mkldnn", "matmul", "tf32"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::TF32); return true; } else if (s_ == "medium") { float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM; - setFloat32Precision("cuda", "matmul", "tf32"); - setFloat32Precision("mkldnn", "matmul", "bf16"); + setFloat32Precision(Float32Backend::CUDA, Float32Op::MATMUL, Float32Precision::TF32); + setFloat32Precision(Float32Backend::MKLDNN, Float32Op::MATMUL, Float32Precision::BF16); return true; } return false; @@ -435,25 +406,16 @@ void Context::setFloat32MatmulPrecision(const std::string &s) { "setFloat32MatmulPrecision call has no effect."); } -void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) { - check_fp32_prec_backend_and_op(backend, op); - if (validate_fp32_prec(backend, p)) { - fp32_precision[backend][op] = p; - } else { - std::string msg; - auto iterp = _fp32_precisions.find(backend); - TORCH_CHECK(iterp != _fp32_precisions.end()); - for (auto p : iterp->second) { - msg += p; - msg += " "; - } - TORCH_WARN( - "you have set wrong precision for backend:", - backend, - " setFloat32Precision call has no effect.", - "Please choose precision from: ", - msg); - } +void Context::setFloat32Precision(Float32Backend backend, Float32Op op, Float32Precision p) { + auto it = fp32_precision.find(std::make_pair(backend, op)); + TORCH_CHECK( + it != fp32_precision.end(), + "Invalid (backend, op) pair: (", backend, ", ", op, ")"); + TORCH_CHECK( + !(backend == Float32Backend::CUDA && p == Float32Precision::BF16), + "backend 'cuda' does not support precision 'bf16'"); + + it->second = p; } at::LinalgBackend Context::linalgPreferredBackend() const { @@ -521,8 +483,8 @@ at::BlasBackend Context::blasPreferredBackend() { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", #endif -#if 
ROCM_VERSION >= 60500 - "gfx950" +#if ROCM_VERSION >= 70000 + "gfx950", "gfx1150", "gfx1151" #endif }; for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) { @@ -625,20 +587,33 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) { rocm_fa_preferred_backend = b; } -bool Context::allowFP16ReductionCuBLAS() const { +CuBLASReductionOption Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } -void Context::setAllowFP16ReductionCuBLAS(bool b) { - allow_fp16_reduction_cublas = b; +CuBLASReductionOption inline get_reduction_option(bool allow_reduced_precision, bool allow_splitk) { + TORCH_CHECK( + !(allow_reduced_precision && !allow_splitk), + "allow_splitk=False is not supported when reduced precision reductions are enabled"); + if (allow_reduced_precision) { + return CuBLASReductionOption::AllowReducedPrecisionWithSplitK; + } else if (allow_splitk) { + return CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK; + } else { + return CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK; + } +} + +void Context::setAllowFP16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { + allow_fp16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); } -bool Context::allowBF16ReductionCuBLAS() const { +CuBLASReductionOption Context::allowBF16ReductionCuBLAS() const { return allow_bf16_reduction_cublas; } -void Context::setAllowBF16ReductionCuBLAS(bool b) { - allow_bf16_reduction_cublas = b; +void Context::setAllowBF16ReductionCuBLAS(bool allow_reduced_precision, bool allow_splitk) { + allow_bf16_reduction_cublas = get_reduction_option(allow_reduced_precision, allow_splitk); } bool Context::allowFP16AccumulationCuBLAS() const { diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5cfa9b23e20a..d0f6ce18862a 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -25,11 +25,13 @@ #include #include #include +#include #include #include #include #include +#include namespace at { @@ -37,6 +39,20 @@ class Tensor; enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM }; +enum class CuBLASReductionOption : uint8_t { + AllowReducedPrecisionWithSplitK = 0, + DisallowReducedPrecisionAllowSplitK = 1, + DisallowReducedPrecisionDisallowSplitK = 2, +}; +enum class TORCH_API Float32Backend { GENERIC, CUDA, MKLDNN }; +enum class TORCH_API Float32Op { ALL, CONV, RNN, MATMUL }; +enum class TORCH_API Float32Precision { NONE, IEEE, TF32, BF16 }; + +TORCH_API Float32Backend str2backend(const std::string& name); +TORCH_API Float32Op str2op(const std::string& name); +TORCH_API Float32Precision str2precision(const std::string& name); +TORCH_API std::string precision2str(Float32Precision prec); + class TORCH_API Context { public: Context(); @@ -210,15 +226,15 @@ class TORCH_API Context { bool userEnabledMkldnn() const; void setUserEnabledMkldnn(bool e); bool benchmarkCuDNN() const; - void setBenchmarkCuDNN(bool); + void setBenchmarkCuDNN(bool /*b*/); int benchmarkLimitCuDNN() const; - void setBenchmarkLimitCuDNN(int); + void setBenchmarkLimitCuDNN(int /*b*/); bool immediateMiopen() const; - void setImmediateMiopen(bool); + void setImmediateMiopen(bool /*b*/); bool deterministicCuDNN() const; - void setDeterministicCuDNN(bool); + void setDeterministicCuDNN(bool /*b*/); bool deterministicMkldnn() const; - void setDeterministicMkldnn(bool); + void setDeterministicMkldnn(bool /*b*/); bool userEnabledNNPACK() const; void setUserEnabledNNPACK(bool e); @@ -236,32 +252,32 @@ class TORCH_API 
Context { void setSDPPriorityOrder(const std::vector& order); std::array sDPPriorityOrder(); - void setSDPUseFlash(bool); + void setSDPUseFlash(bool /*e*/); bool userEnabledFlashSDP() const; - void setSDPUseMemEfficient(bool); + void setSDPUseMemEfficient(bool /*e*/); bool userEnabledMemEfficientSDP() const; - void setSDPUseMath(bool); + void setSDPUseMath(bool /*e*/); bool userEnabledMathSDP() const; - void setSDPUseCuDNN(bool); + void setSDPUseCuDNN(bool /*e*/); bool userEnabledCuDNNSDP() const; - void setAllowFP16BF16ReductionMathSDP(bool); + void setAllowFP16BF16ReductionMathSDP(bool /*e*/); bool allowFP16BF16ReductionMathSDP() const; - void setSDPUseOverrideable(bool); + void setSDPUseOverrideable(bool /*e*/); bool userEnabledOverrideableSDP() const; at::LinalgBackend linalgPreferredBackend() const; - void setLinalgPreferredBackend(at::LinalgBackend); + void setLinalgPreferredBackend(at::LinalgBackend /*b*/); at::BlasBackend blasPreferredBackend(); - void setBlasPreferredBackend(at::BlasBackend); + void setBlasPreferredBackend(at::BlasBackend /*b*/); at::ROCmFABackend getROCmFAPreferredBackend(); - void setROCmFAPreferredBackend(at::ROCmFABackend); + void setROCmFAPreferredBackend(at::ROCmFABackend /*b*/); // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -294,9 +310,9 @@ class TORCH_API Context { bool deterministicAlgorithms() const; bool deterministicAlgorithmsWarnOnly() const; - void setDeterministicAlgorithms(bool, bool); + void setDeterministicAlgorithms(bool /*b*/, bool /*warn_only*/); bool deterministicFillUninitializedMemory() const; - void setDeterministicFillUninitializedMemory(bool); + void setDeterministicFillUninitializedMemory(bool /*b*/); // Note [Writing Nondeterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -310,13 +326,7 @@ class TORCH_API Context { // // * Throw an error when `Context::deterministicAlgorithms()` is true. Most // of the time, this should be accomplished by calling - // `at::globalContext().alertNotDeterminstic()`. However, if the - // nondeterministic behavior is caused by the CuBLAS workspace - // configuration in CUDA >= 10.2, - // `at::globalContext().alertCuBLASConfigNotDeterministic()` should be - // called instead (in this case, a comment explaining why the operation is - // nondeterministic is not necessary). See below for details on these - // methods. + // `at::globalContext().alertNotDeterminstic(). // // * Have an entry in the list of nondeterministic PyTorch operations in the // docstring of `use_deterministic_algorithms()` in torch/__init__.py @@ -340,33 +350,29 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true static void alertNotDeterministic(std::string_view const& caller); - // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA - // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or - // ":4096:8". 
For more details: - // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility - void alertCuBLASConfigNotDeterministic() const; - void setFloat32MatmulPrecision(const std::string& s); void setFloat32Precision( - const std::string& backend, - const std::string& op, - const std::string& s); - bool allowTF32CuDNN(const std::string& op = std::string()) const; - void setAllowTF32CuDNN(bool); + Float32Backend backend, + Float32Op op, + Float32Precision p); + bool allowTF32CuDNN(std::optional op = std::nullopt) const; + void setAllowTF32CuDNN(bool /*b*/); bool allowTF32OneDNN() const; - void setAllowTF32OneDNN(bool); + void setAllowTF32OneDNN(bool /*b*/); bool allowTF32CuBLAS() const; - void setAllowTF32CuBLAS(bool); + void setAllowTF32CuBLAS(bool /*b*/); Float32MatmulPrecision float32MatmulPrecision() const; - std::string float32Precision( - const std::string& backend, - const std::string& op) const; - bool allowFP16ReductionCuBLAS() const; - void setAllowFP16ReductionCuBLAS(bool); - bool allowBF16ReductionCuBLAS() const; - void setAllowBF16ReductionCuBLAS(bool); + Float32Precision float32Precision(Float32Backend backend, Float32Op op) const; + CuBLASReductionOption allowFP16ReductionCuBLAS() const; + void setAllowFP16ReductionCuBLAS( + bool allow_reduced_precision, + bool allow_splitk = true); + CuBLASReductionOption allowBF16ReductionCuBLAS() const; + void setAllowBF16ReductionCuBLAS( + bool allow_reduced_precision, + bool allow_splitk = true); bool allowFP16AccumulationCuBLAS() const; - void setAllowFP16AccumulationCuBLAS(bool); + void setAllowFP16AccumulationCuBLAS(bool /*b*/); // Matmuls can use a so-called "persistent" kernel which launches one CUDA // block for each SM on the GPU, and each block then iterates over multiple @@ -378,7 +384,7 @@ class TORCH_API Context { // to make matmuls target only a subset of the SMs, so they can fully schedule // even next to a comms kernel, and only be a few percent slower. 
std::optional _SMCarveout_EXPERIMENTAL() const; - void _setSMCarveout_EXPERIMENTAL(std::optional); + void _setSMCarveout_EXPERIMENTAL(std::optional /*c*/); at::QEngine qEngine() const; void setQEngine(at::QEngine e); @@ -399,7 +405,7 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); bool allowFP16ReductionCPU() const; - void setAllowFP16ReductionCPU(bool); + void setAllowFP16ReductionCPU(bool /*b*/); // Preserved for BC void lazyInitCUDA() { @@ -429,7 +435,6 @@ class TORCH_API Context { } private: - static bool checkCuBLASConfigDeterministic(); std::array init_; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -457,8 +462,10 @@ class TORCH_API Context { : at::Float32MatmulPrecision::HIGHEST; int benchmark_limit_cudnn = 10; bool allow_tf32_cudnn = true; - bool allow_fp16_reduction_cublas = true; - bool allow_bf16_reduction_cublas = true; + CuBLASReductionOption allow_fp16_reduction_cublas = + CuBLASReductionOption::AllowReducedPrecisionWithSplitK; + CuBLASReductionOption allow_bf16_reduction_cublas = + CuBLASReductionOption::AllowReducedPrecisionWithSplitK; bool allow_fp16_accumulation_cublas = false; std::optional sm_carveout = std::nullopt; bool enabled_mkldnn = true; @@ -488,21 +495,20 @@ class TORCH_API Context { bool enable_sparse_tensor_invariant_checks = false; bool allow_fp16_reduction_cpu = false; - std::map> fp32_precision = { - {"generic", {{"all", "none"}}}, - {"mkldnn", - {{"matmul", "none"}, - {"conv", "none"}, - {"rnn", "none"}, - {"all", "none"}}}, - {"cuda", - {{"matmul", - float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST - ? "none" - : "tf32"}, - {"conv", "tf32"}, - {"rnn", "tf32"}, - {"all", "none"}}}, + using Key = std::pair; + std::unordered_map> fp32_precision = { + {{Float32Backend::GENERIC, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::CONV}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::RNN}, Float32Precision::NONE}, + {{Float32Backend::MKLDNN, Float32Op::MATMUL}, Float32Precision::NONE}, + {{Float32Backend::CUDA, Float32Op::ALL}, Float32Precision::NONE}, + {{Float32Backend::CUDA, Float32Op::CONV}, Float32Precision::TF32}, + {{Float32Backend::CUDA, Float32Op::RNN}, Float32Precision::TF32}, + {{Float32Backend::CUDA, Float32Op::MATMUL}, + float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST + ? Float32Precision::NONE + : Float32Precision::TF32}, }; Allocator* prev_allocator_ptr_{nullptr}; @@ -684,5 +690,4 @@ struct TORCH_API ROCmBackwardPassGuard { ~ROCmBackwardPassGuard(); static bool is_backward_pass(); }; - } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index b16d188b99a5..ccb0ae15a11e 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -389,54 +389,16 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - auto view = src; - - // Detect whether there is need to normalize the strides - // Background: gh-83069 - // - // However, normalizing strides can come at a high-cost - // to slow down toDLPack conversion 3x, so we - // only normalize if needed. - // - // The following code detects whether the src follows - // a continuous pattern. If the src follows such pattern (common-case) - // then we do not need to normalize the strides. 
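Stepping back to the `Context` changes a few hunks up: the string-based fp32 precision API is replaced by enum-keyed setters/getters, and the cuBLAS reduction toggles now take an `(allow_reduced_precision, allow_splitk)` pair and report a `CuBLASReductionOption`. A hedged usage sketch follows (not code from the patch; it only exercises the signatures declared in `Context.h` above):

```cpp
#include <ATen/Context.h>
#include <string>

// Configure TF32 matmul precision through the new enum-based API instead of
// the old string-based setFloat32Precision("cuda", "matmul", "tf32").
void configure_fp32(at::Context& ctx) {
  ctx.setFloat32Precision(
      at::Float32Backend::CUDA, at::Float32Op::MATMUL, at::Float32Precision::TF32);

  // Queries come back as enums; precision2str() recovers the legacy string.
  at::Float32Precision p =
      ctx.float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL);
  std::string s = at::precision2str(p);  // "tf32"

  // FP16/BF16 cuBLAS reductions now take (allow_reduced_precision, allow_splitk)
  // and report a CuBLASReductionOption rather than a bool.
  ctx.setAllowFP16ReductionCuBLAS(/*allow_reduced_precision=*/false,
                                  /*allow_splitk=*/true);
  at::CuBLASReductionOption opt = ctx.allowFP16ReductionCuBLAS();
  (void)s;
  (void)opt;
}

// Typical call site: configure_fp32(at::globalContext());
```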
- bool need_normalize_strides = false; - int64_t expected_stride = 1; - for (int i = src.dim() - 1; i >= 0; i--) { - // detect if we do not meet continuous pattern - // and the size is 1, so there is opportunity to normalize - if (src.stride(i) != expected_stride && src.size(i) == 1) { - need_normalize_strides = true; - break; - } - expected_stride *= src.size(i); - } - - // less common case, try normalizing the strides - if (need_normalize_strides) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; - } - } - view = src.as_strided(shape, strides, src.storage_offset()); - } - ATenDLMTensor* atDLMTensor(new ATenDLMTensor); - atDLMTensor->handle = view; + atDLMTensor->handle = src; atDLMTensor->tensor.manager_ctx = atDLMTensor; atDLMTensor->tensor.deleter = &deleter; - atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); + atDLMTensor->tensor.dl_tensor.data = src.data_ptr(); atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device()); atDLMTensor->tensor.dl_tensor.ndim = static_cast(src.dim()); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = const_cast(view.sizes().data()); - atDLMTensor->tensor.dl_tensor.strides = const_cast(view.strides().data()); + atDLMTensor->tensor.dl_tensor.shape = const_cast(src.sizes().data()); + atDLMTensor->tensor.dl_tensor.strides = const_cast(src.strides().data()); atDLMTensor->tensor.dl_tensor.byte_offset = 0; fillVersion(&atDLMTensor->tensor); diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index b1c2eaa2d6ea..928731fafb2f 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -52,16 +52,16 @@ struct DLPackTraits {}; template <> struct DLPackTraits { - inline static const char* capsule = "dltensor"; - inline static const char* used = "used_dltensor"; + inline static constexpr const char* capsule = "dltensor"; + inline static constexpr const char* used = "used_dltensor"; inline static auto toDLPack = at::toDLPack; inline static auto fromDLPack = at::fromDLPack; }; template <> struct DLPackTraits { - inline static const char* capsule = "dltensor_versioned"; - inline static const char* used = "used_dltensor_versioned"; + inline static constexpr const char* capsule = "dltensor_versioned"; + inline static constexpr const char* used = "used_dltensor_versioned"; inline static auto toDLPack = at::toDLPackVersioned; inline static auto fromDLPack = at::fromDLPackVersioned; }; diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index e34be30f9607..ac76d09537fa 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -16,8 +16,8 @@ inline void check_size_nonnegative(ArrayRef size) { inline void check_size_nonnegative(ArrayRef size) { for (const auto& x : size) { - TORCH_CHECK( - x.expect_size(__FILE__, __LINE__), + TORCH_SYM_CHECK( + x.sym_ge(0), "Trying to create tensor with negative dimension ", x, ": ", diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 090699339ccf..1bf46ebe61b6 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -468,7 +468,7 @@ inline Tensor _sum_to( // if we assume no reduction due to unbacked we ensure that at runtime. 
TORCH_MAYBE_SYM_CHECK( sym_eq(shape[i - leading_dims], sizes[i]), - "non-reduction path was assumed due to unabcked symbols expected those two sizes to be the same:", + "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:", shape[i - leading_dims], ", ", sizes[i]) diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index 2cf8d9727f65..9631872875c6 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,11 +9,6 @@ namespace at::functionalization { -ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { - if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); -} - // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. @@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - if (update.view_metas.empty()) return t; + if (update.view_metas.empty()) { return t; } std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { - at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); + at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { - int64_t out_idx = update.view_metas[i].out_index; // Each view inverse is implemented in ViewInverses.cpp. - t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); + t = update.view_metas[i]->reverse(tmp_values[i], t); } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } -void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI - TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. 
As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 8cd1cb7434aa..0c9c1fd775f3 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,44 +8,89 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +enum class InverseReturnMode { + /// Specifies that functional inverses should always return a view. + AlwaysView, + /// Specifies that functional inverses should always return a non-view / copy. + NeverView, + /// Specifies that functional inverses should return a view unless a (copying) + /// scatter + /// inverse exists, in which case that will be used instead. + /// This avoids as_strided() calls that can be difficult for subclasses to + /// handle. + ViewOrScatterInverse, +}; + +#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \ + static const char* name() { \ + return #TYPE; \ + } + +#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \ + using SerializableTuple = std::tuple<__VA_ARGS__> + // ViewMeta is a class used by the functionalization pass to navigate between // a base tensor and a view tensor. // For example, if I call `b = a.view1(...)` -// the functionalization pass will generate and store a ViewMeta on b that looks -// like: +// the functionalization pass will generate and store a ViewMeta specialization +// for `view1` operation on b that looks like: // -// ViewMeta( -// [](const Tensor& base, int64_t mutated_view_idx) { -// return base.view1(...); -// }, -// [](const at::Tensor& base, const at::Tensor& mutated_view, -// int64_t mutated_view_idx) -> at::Tensor { -// return at::functionalization::impl::view1_inverse(base, mutated_view, -// ...); +// struct TORCH_API view1_ViewMeta : public ViewMeta { +// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); +// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( +// bool /* reapply_views */, +// const std::vector&); +// +// view1_ViewMeta(const SerializableTuple& tpl) +// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} +// +// view1_ViewMeta(bool reapply_views, const std::vector& size) +// : ViewMeta(/*has_symbolic_inputs=*/false), +// reapply_views(reapply_views), +// size(size) {} +// +// Tensor forward(const Tensor& base) override { +// return base.view1(...); // } // -// The forward_fn lambda describes how to replay view1 on a tensor. +// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } // -// The reverse_fn lambda describes how, given a tensor that is already a view, +// SerializableTuple to_serializable_tuple() { +// return std::make_tuple(reapply_views, size); +// } +// +// bool reapply_views; +// std::vector size; +// }; +// +// The forward function describes how to replay view1 on a tensor. +// +// The reverse function describes how, given a tensor that is already a view, // how to get the corresponding base tensor. See Note [Functionalization Pass: // View Inverses] for details. +// +// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type +// representing the `ViewMeta` instance state. Methods that take in/return such +// a type are used for supporting pickle serialization. 
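To complement the schematic `view1_ViewMeta` in the comment above, here is a compilable sketch of the same pattern for a hypothetical narrow-on-dim-0 view. The struct name and the use of `slice_scatter` as the inverse are illustrative assumptions, not part of this patch:

```cpp
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

// Illustrative only: a ViewMeta for `base.narrow(0, start, length)` written
// against the new virtual forward()/reverse() interface.
struct narrow0_ViewMeta : public at::functionalization::ViewMeta {
  FUNCTIONALIZATION_VIEWMETA_NAME(narrow0_ViewMeta)
  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
      int64_t /* start */,
      int64_t /* length */);

  narrow0_ViewMeta(const SerializableTuple& tpl)
      : narrow0_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
  narrow0_ViewMeta(int64_t start, int64_t length)
      : ViewMeta(/*has_symbolic_inputs=*/false), start(start), length(length) {}

  at::Tensor forward(const at::Tensor& base) override {
    return base.narrow(0, start, length);
  }

  at::Tensor reverse(const at::Tensor& base, const at::Tensor& mutated_view) override {
    // Write the mutated view back into its slot in the base (assumption:
    // slice_scatter is an adequate inverse for a dim-0 narrow).
    return base.slice_scatter(mutated_view, /*dim=*/0, start, start + length);
  }

  SerializableTuple to_serializable_tuple() {
    return std::make_tuple(start, length);
  }

  int64_t start;
  int64_t length;
};
```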
struct ViewMeta { ViewMeta( - std::function forward, - std::function reverse, bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) - : forward_fn(std::move(forward)), - reverse_fn(std::move(reverse)), - out_index(out_idx), + : out_index(out_idx), is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} - std::function forward_fn; - std::function reverse_fn; + virtual ~ViewMeta() = default; + + virtual Tensor forward(const Tensor& base) = 0; + virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; + // See Note [out_idx in ViewMeta] int64_t out_index; @@ -57,10 +102,17 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; - // Returns a copy of the current ViewMeta, if out_idx matches the current - // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. - ViewMeta to_out_idx(int64_t out_idx); + // + // This method should be implemented by those `ViewMeta` that have more than + // one output. + virtual std::shared_ptr to_out_index(int64_t out_index) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "ViewMeta::to_out_index not implemented. ", + "Likely because there's only one output."); + } }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const std::vector view_metas; + const std::vector> view_metas; }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, - const std::vector& view_metas); + const std::vector>& view_metas); bool apply_updates(); const Tensor& base() { return base_; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861..d553cc1fb949 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. 
-FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) - : c10::TensorImpl( - c10::DispatchKeySet(DispatchKey::Functionalize), - view_value.dtype(), - view_value.device() - ), - value_(view_value), - is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), - was_storage_changed_(base->was_storage_changed_), - is_symbolic_(base->is_symbolic_) -{ +FunctionalTensorWrapper::FunctionalTensorWrapper( + const Tensor& view_value, + const FunctionalTensorWrapper* base, + const std::shared_ptr& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + base->storage().data_ptr().device()), + value_(view_value), + is_multi_output_view_( + base->is_multi_output_view_ || meta->is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } - functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { +void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. // An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. 
at::AutoDispatchSkipFunctionalize guard; - value_ = meta.forward_fn(value_, meta.out_index); + value_ = meta->forward(value_); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } -Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { - auto t = base; - - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } - - return t; +const std::vector>& FunctionalTensorWrapper::view_metas() const { + return view_metas_; } void FunctionalTensorWrapper::regenerate_from_base() { @@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - t = apply_view_metas(t); + t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -485,7 +479,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptrdevice(); + // The storage pointer already uses the underlying tensor custom device (if + // applicable) to extract the device. So, we dont have to recurse again by + // doing value_.unsafeGetTensorImpl()->device(). + return storage().data_ptr().device(); } at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const { return value_.unsafeGetTensorImpl()->sizes(); @@ -724,11 +721,11 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { - if (t_list.empty()) return false; + if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; - if (!e.has_value() || !e->defined()) continue; + if (!e.has_value() || !e->defined()) { continue; } if (isFunctionalTensor(e)) { ++functional_count; } @@ -738,10 +735,10 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { - if (list.size() == 0) return false; + if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { - if (!tensor.defined()) continue; + if (!tensor.defined()) { continue; } if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -759,20 +756,28 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } -Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { +Tensor create_functional_tensor_with_view_meta( + const at::Tensor& view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta, + int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); + auto meta_ = meta; if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
- meta = meta.to_out_idx(out_idx); + meta_ = meta->to_out_index(out_idx); } - return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +std::vector create_functional_tensor_with_view_meta( + ITensorListRef view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -782,12 +787,22 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence) { + Tensor r = base; + for (auto& vm : sequence) { + r = vm->forward(r); + } + return r; +} + // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -881,7 +896,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { continue; } at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index b260b7c9f958..6d9050728da7 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } - void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { - is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; + void maybe_mark_symbolic(functionalization::ViewMeta* meta) { + is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; } bool is_symbolic() const { return is_symbolic_; } - // Runs the forward_fn of every ViewMeta collected in the current instance - // to some other base. - Tensor apply_view_metas(const Tensor& base); + // Retrieves the ViewMeta sequence of this tensor. + const std::vector>& view_metas() + const; // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. 
It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(const at::functionalization::ViewMeta& meta); + void mutate_view_meta( + const std::shared_ptr& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_symbolic_ = false; size_t generation_ = 0; - std::vector view_metas_; + std::vector> view_metas_; protected: static void copy_tensor_metadata( @@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta, + const std::shared_ptr& meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); void mutate_view_meta( const Tensor& self, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); + +TORCH_API Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 97094c9f125a..10f988b4d281 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -7,7 +9,6 @@ #include #include #include -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,6 +29,31 @@ #include #endif +namespace at::functionalization { + +Tensor resize__ViewMeta::forward(const Tensor& base) { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } +} + +Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return base.as_strided_scatter( + mutated_view, size, c10::contiguous_strides(size)); +} + +Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) { + return at::_unsafe_view_symint(base, size); +} + +Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); +} + +} // namespace at::functionalization + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -106,7 +132,9 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { + continue; + } auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. 
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - if (reapply_views) { - return base.as_strided(size, c10::contiguous_strides(size)); - } else { - return at::as_strided_copy(base, size, c10::contiguous_strides(size)); - } - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); - }, - /*has_symbolic_inputs=*/false - ); + auto view_meta = std::make_shared( + reapply_views, size.vec()); at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } - bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); - - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(base, size); - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); - }, - /*has_symbolic_inputs=*/has_symbolic_inputs - ); + bool has_symbolic_inputs = std::any_of( + size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + auto view_meta = + std::make_shared( + has_symbolic_inputs, size.vec()); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.h b/aten/src/ATen/FunctionalizeFallbackKernel.h new file mode 100644 index 000000000000..aabcfc827af3 --- /dev/null +++ b/aten/src/ATen/FunctionalizeFallbackKernel.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at::functionalization { + +// `ViewMeta` implementation for `resize_` operation. +struct TORCH_API resize__ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* reapply_views */, + const std::vector&); + + resize__ViewMeta(const SerializableTuple& tpl) + : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + resize__ViewMeta(bool reapply_views, const std::vector& size) + : ViewMeta(/*has_symbolic_inputs=*/false), + reapply_views(reapply_views), + size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(reapply_views, size); + } + + bool reapply_views; + std::vector size; +}; + +// `ViewMeta` implementation for `_unsafe_view` operation. 
+struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* has_symbolic_inputs */, + const std::vector&); + + _unsafe_view_ViewMeta(const SerializableTuple& tpl) + : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + _unsafe_view_ViewMeta( + bool has_symbolic_inputs, + const std::vector& size) + : ViewMeta(has_symbolic_inputs), size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(has_symbolic_inputs, size); + } + + std::vector size; +}; + +} // namespace at::functionalization diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index e701882a2606..817bf0ddba0b 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,7 @@ inline void infer_size_impl( std::optional infer_dim; for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) { if (TORCH_GUARD_OR_FALSE(sym_eq(shape[dim], -1))) { - if (infer_dim) { - throw std::runtime_error("only one dimension can be inferred"); - } + TORCH_CHECK(!infer_dim, "only one dimension can be inferred"); infer_dim = dim; } else { // in case of unbacked shape[dim] we assume it's not -1 and add a runtime @@ -45,7 +44,39 @@ inline void infer_size_impl( } } - auto set_infer_dim = [&]() { + if (infer_dim) { + // numel is the product of known sizes, it has to be divisible by newsize. + // and newsize should be positive unless newsize == numel (we throw + // different) error message in that case. + if constexpr (std::is_same_v) { + auto v = newsize.maybe_as_int(); + if (v and *v == 0) { + // Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed! + // which may happen when newsize is not a symbol! if its a symbol + // division won't happen anyway during compile. + TORCH_MAYBE_SYM_CHECK( + numel == newsize, + "shape '", + shape, + "' is invalid for input of size ", + numel); + } else { + auto cond = sym_gt(newsize, 0) + .sym_and(sym_eq(numel % newsize, 0)) + .sym_or(sym_eq(numel, newsize)); + TORCH_MAYBE_SYM_CHECK( + cond, "shape '", shape, "' is invalid for input of size ", numel); + } + + } else { + TORCH_CHECK( + (newsize > 0 && (numel % newsize == 0)) || numel == newsize, + "shape '", + shape, + "' is invalid for input of size ", + numel); + } + // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. However, a nice error message is needed // because users often use `view` as a way to flatten & unflatten @@ -54,18 +85,14 @@ inline void infer_size_impl( // works yet // empty_tensor.view(-1, 0) // doesn't. 
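For the concrete (non-symbolic) branch, the condition enforced above is a divisibility check of the total element count against the product of the known dimensions. A standalone sketch of that logic, illustrative rather than the ATen implementation:

#include <cstdint>
#include <stdexcept>
#include <vector>

// Infer the single -1 entry of `shape` so the total element count matches `numel`.
std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  int64_t newsize = 1;   // product of the known (non -1) dimensions
  int64_t infer_dim = -1;
  for (int64_t d = 0; d < static_cast<int64_t>(shape.size()); ++d) {
    if (shape[d] == -1) {
      if (infer_dim != -1) throw std::runtime_error("only one dimension can be inferred");
      infer_dim = d;
    } else {
      newsize *= shape[d];
    }
  }
  if (infer_dim == -1) {
    if (numel != newsize) throw std::runtime_error("shape is invalid for input size");
    return shape;
  }
  // same condition as the check above: divisible, or the degenerate equal case
  if (!((newsize > 0 && numel % newsize == 0) || numel == newsize))
    throw std::runtime_error("shape is invalid for input size");
  if (newsize == 0)
    throw std::runtime_error("cannot infer -1: the remaining dimensions are ambiguous");
  shape[infer_dim] = numel / newsize;   // e.g. {-1, 4} with numel == 12 -> {3, 4}
  return shape;
}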
- TORCH_CHECK( + TORCH_MAYBE_SYM_CHECK( newsize != 0, "cannot reshape tensor of 0 elements into shape ", shape, " because the unspecified dimension size -1 can be any " "value and is ambiguous"); - res[*infer_dim] = numel / newsize; - return; - }; - if (infer_dim && newsize > 0 && numel % newsize == 0) { - set_infer_dim(); + res[*infer_dim] = numel / newsize; return; } @@ -75,9 +102,6 @@ inline void infer_size_impl( shape, "' is invalid for input of size ", numel); - if (infer_dim) { - set_infer_dim(); - } } inline std::vector infer_size(IntArrayRef shape, int64_t numel) { diff --git a/aten/src/ATen/LegacyBatchingRegistrations.cpp b/aten/src/ATen/LegacyBatchingRegistrations.cpp index 4c8c07f84e96..2c54718e938f 100644 --- a/aten/src/ATen/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/LegacyBatchingRegistrations.cpp @@ -58,7 +58,7 @@ namespace at { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } @@ -365,7 +365,7 @@ Tensor select_batching_rule(const Tensor& self, int64_t dim, int64_t index) { return self_physical.getPhysicalToLogicalMap().apply(result); } -static int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { +int64_t getGradInputPhysicalDim(int64_t dim, IntArrayRef input_sizes, int64_t num_batch_dims) { return maybe_wrap_dim(dim, static_cast(input_sizes.size())) + num_batch_dims; } @@ -488,7 +488,7 @@ Tensor view_as_complex_batching_rule(const Tensor& self) { // Checks that the smallest batch stride is greater than the largest example // stride. This is something we can support but we choose not to because it's // potentially error prone. -static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { +void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t num_batch_dims) { auto smallest_batch_stride = std::min_element( physical_strides.begin(), physical_strides.begin() + num_batch_dims); auto largest_example_stride = std::max_element( @@ -508,7 +508,7 @@ static void checkBatchDimsAtFrontInLayout(IntArrayRef physical_strides, int64_t // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). -static std::optional maximum_indexable_location( +std::optional maximum_indexable_location( IntArrayRef sizes, IntArrayRef strides, int64_t storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -521,7 +521,7 @@ static std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. 
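Concretely, the bound being compared is the largest storage location a layout can touch: storage_offset plus the sum of (size - 1) * stride over all dimensions. A small standalone illustration of what maximum_indexable_location computes (assumes non-negative strides; not the ATen code):

#include <cstdint>
#include <optional>
#include <vector>

// Largest storage location reachable by a view with (sizes, strides, storage_offset);
// nullopt for layouts with a zero-sized dim, which touch no memory at all.
std::optional<int64_t> max_location_sketch(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& strides,
    int64_t storage_offset) {
  int64_t span = 1;  // elements spanned by the layout, as in native::storage_size_for
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) return std::nullopt;
    span += (sizes[d] - 1) * strides[d];
  }
  return storage_offset + span - 1;
}
// e.g. sizes {2, 3}, strides {3, 1}, offset 4 -> 4 + (1*3 + 2*1) = 9.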
-static void checkBasicAsStridedValidForSlice( +void checkBasicAsStridedValidForSlice( const Tensor& physical_tensor, int64_t num_batch_dims, IntArrayRef sizes, diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index 63a278050e8a..ed697c32b58a 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -62,7 +62,7 @@ constexpr const char* unknown_eventname = "eventname not specified"; #endif } // namespace (anonymous) -MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, size_t size) +MapAllocator::MapAllocator(WithFd /*unused*/, std::string_view filename, int fd, int flags, size_t size) : filename_(filename.empty() ? unknown_filename : filename) , size_(0) // to be filled later #ifdef _WIN32 @@ -292,6 +292,28 @@ MapAllocator::MapAllocator(WithFd, std::string_view filename, int fd, int flags, if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", c10::utils::str_error(errno), " (", errno, ")"); } + +#ifdef HAVE_POSIX_FALLOCATE + if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { + for (;;) { + if (posix_fallocate(fd, 0, static_cast(size)) == 0) { + break; + } + + if (errno == EINTR) { + continue; + } + + if (errno == EINVAL || errno == EOPNOTSUPP) { + // the underlying filesystem does not support the operation + break; + } + + TORCH_CHECK(false, "unable to allocate shared memory(shm) for file <", filename_, ">: ", c10::utils::str_error(errno), " (", errno, ")"); + } + } +#endif + if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { #ifndef STRIP_ERROR_MESSAGES int last_err = errno; @@ -472,7 +494,7 @@ RefcountedMapAllocator::RefcountedMapAllocator(const char *filename, int flags, initializeAlloc(); } -RefcountedMapAllocator::RefcountedMapAllocator(WithFd, const char *filename, int fd, int flags, size_t size) +RefcountedMapAllocator::RefcountedMapAllocator(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size) : RefcountedMapAllocatorArgCheck(flags) , MapAllocator(WITH_FD, filename, flags, fd, size + map_alloc_alignment) { @@ -592,7 +614,7 @@ at::DataPtr MapAllocator::makeDataPtr(std::string_view filename, int flags, size return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; } -at::DataPtr MapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +at::DataPtr MapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new MapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size(); return {context->data(), context, &deleteMapAllocator, at::DeviceType::CPU}; @@ -604,7 +626,7 @@ at::DataPtr RefcountedMapAllocator::makeDataPtr(const char *filename, int flags, return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; } -at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { +at::DataPtr RefcountedMapAllocator::makeDataPtr(WithFd /*unused*/, const char *filename, int fd, int flags, size_t size, size_t* actual_size_out) { auto* context = new RefcountedMapAllocator(WITH_FD, filename, fd, flags, size); if (actual_size_out) *actual_size_out = context->size() - map_alloc_alignment; return {context->data(), context, &deleteRefcountedMapAllocator, at::DeviceType::CPU}; diff --git 
a/aten/src/ATen/MapAllocator.h b/aten/src/ATen/MapAllocator.h index 9fc5e32adcb5..7a3415a4c411 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -25,7 +25,7 @@ class TORCH_API MapAllocator { public: MapAllocator(std::string_view filename, int flags, size_t size); MapAllocator( - WithFd, + WithFd /*unused*/, std::string_view filename, int fd, int flags, @@ -59,14 +59,14 @@ class TORCH_API MapAllocator { return flags_; } - static MapAllocator* fromDataPtr(const at::DataPtr&); + static MapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); static at::DataPtr makeDataPtr( std::string_view filename, int flags, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, @@ -105,13 +105,13 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, public: RefcountedMapAllocator(const char* filename, int flags, size_t size); RefcountedMapAllocator( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, size_t size); - static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&); + static RefcountedMapAllocator* fromDataPtr(const at::DataPtr& /*dptr*/); RefcountedMapAllocator(const RefcountedMapAllocator&) = delete; RefcountedMapAllocator(RefcountedMapAllocator&&) = delete; RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete; @@ -122,7 +122,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, size_t size, size_t* actual_size_out); static at::DataPtr makeDataPtr( - WithFd, + WithFd /*unused*/, const char* filename, int fd, int flags, diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 09fbedd4056d..2de73a70dd33 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -179,7 +179,7 @@ void propagate_names_except(const Tensor& result, const Tensor& src, IntArrayRef return; } const auto src_names = src.names(); - const auto result_dim = static_cast(result.dim()); + const auto result_dim = result.dim(); const auto src_dim = static_cast(src_names.size()); const auto excluded_dim = static_cast(excluded_idxs.size()); TORCH_INTERNAL_ASSERT(src_dim - excluded_dim == result_dim); diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 63bd867f9022..ea951ed3db13 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -273,7 +273,7 @@ c10::SymInt NestedTensorImpl::sym_numel_custom() const { return NestedTensorImpl::numel_custom(); } -c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat) const { +c10::SymBool NestedTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { return nested_tensor_impl_is_contiguous(this); } IntArrayRef NestedTensorImpl::sizes_custom() const { diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index cddf37df34a5..9b92e9ec83ad 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -115,7 +115,8 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { // with real implementations int64_t numel_custom() const override; c10::SymInt sym_numel_custom() const override; - c10::SymBool sym_is_contiguous_custom(MemoryFormat) const override; + c10::SymBool sym_is_contiguous_custom( + MemoryFormat /*memory_format*/) const override; int64_t size_custom(int64_t d) const override { return this->size(d); } diff --git a/aten/src/ATen/Parallel.h 
b/aten/src/ATen/Parallel.h index b55dad02f347..d09a33841b94 100644 --- a/aten/src/ATen/Parallel.h +++ b/aten/src/ATen/Parallel.h @@ -14,7 +14,7 @@ inline int64_t divup(int64_t x, int64_t y) { TORCH_API void init_num_threads(); // Sets the number of threads to be used in parallel region -TORCH_API void set_num_threads(int); +TORCH_API void set_num_threads(int /*nthreads*/); // Returns the maximum number of threads that may be used in a parallel region TORCH_API int get_num_threads(); @@ -37,7 +37,7 @@ inline void lazy_init_num_threads() { } } -TORCH_API void set_thread_num(int); +TORCH_API void set_thread_num(int /*id*/); class TORCH_API ThreadIdGuard { public: @@ -130,7 +130,7 @@ inline scalar_t parallel_reduce( TORCH_API std::string get_parallel_info(); // Sets number of threads used for inter-op parallelism -TORCH_API void set_num_interop_threads(int); +TORCH_API void set_num_interop_threads(int /*nthreads*/); // Returns the number of threads used for inter-op parallelism TORCH_API size_t get_num_interop_threads(); diff --git a/aten/src/ATen/PythonTorchFunctionTLS.cpp b/aten/src/ATen/PythonTorchFunctionTLS.cpp index e4105bf8468f..e90065543e35 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.cpp +++ b/aten/src/ATen/PythonTorchFunctionTLS.cpp @@ -42,8 +42,14 @@ const PythonTorchFunctionTLS& PythonTorchFunctionTLS::get_state() { } bool torch_function_mode_enabled() { - return PythonTorchFunctionTLS::get_disabled_state() != TorchFunctionDisabledState::ALL_DISABLED && - PythonTorchFunctionTLS::stack_len() > 0; + // Manually flatten because gcc is refusing to inline here. Note + // that we are still calling __tls_get_addr twice here with GCC, + // presumably because of + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81501 (which says + // the fix ships in GCC 16), but forcing inlining still improves + // performance. + const auto& ptfs = pythonTorchFunctionState; + return ptfs.disabled_state_ != TorchFunctionDisabledState::ALL_DISABLED && !ptfs.stack_.empty(); } // This is needed to disambiguate the ternary torch function disabled states diff --git a/aten/src/ATen/PythonTorchFunctionTLS.h b/aten/src/ATen/PythonTorchFunctionTLS.h index a245a55ebdc4..502bb535be05 100644 --- a/aten/src/ATen/PythonTorchFunctionTLS.h +++ b/aten/src/ATen/PythonTorchFunctionTLS.h @@ -27,6 +27,7 @@ struct TORCH_API PythonTorchFunctionTLS { TorchFunctionDisabledState disabled_state_ = TorchFunctionDisabledState::ENABLED; std::vector> stack_; + friend TORCH_API bool torch_function_mode_enabled(); }; TORCH_API bool torch_function_mode_enabled(); diff --git a/aten/src/ATen/SavedTensorHooks.cpp b/aten/src/ATen/SavedTensorHooks.cpp index e05e3145fdf3..69d0c243156f 100644 --- a/aten/src/ATen/SavedTensorHooks.cpp +++ b/aten/src/ATen/SavedTensorHooks.cpp @@ -13,7 +13,7 @@ namespace { // and left at true for the rest of the execution. // It's an optimization so that users who never use default hooks don't need to // read the thread_local variables pack_hook_ and unpack_hook_. 
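The PythonTorchFunctionTLS hunks above avoid accessor calls on the hot path by befriending the free function so it can read the thread_local fields directly. A generic sketch of that pattern (names are hypothetical, not the PyTorch types):

#include <vector>

bool mode_enabled();  // forward-declared so the TLS holder can befriend it

struct ModeTLS {
  static const ModeTLS& get();
 private:
  bool disabled_ = false;
  std::vector<int> stack_;
  friend bool mode_enabled();  // grant the hot predicate direct field access
};

namespace {
thread_local ModeTLS tls_state;  // one instance per thread
}

const ModeTLS& ModeTLS::get() { return tls_state; }

// Hot path: a single TLS lookup, no out-of-line accessors the compiler must inline.
bool mode_enabled() {
  const auto& s = tls_state;
  return !s.disabled_ && !s.stack_.empty();
}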
- static bool is_initialized(false); + bool is_initialized(false); } static void assertSavedTensorHooksNotDisabled() { diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index f73d75ab53ad..dec6d2e95960 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -252,7 +252,7 @@ void SparseCsrTensorImpl::set_stride(int64_t dim, int64_t new_stride) { void SparseCsrTensorImpl::set_storage_offset(int64_t storage_offset) { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have set_storage_offset."); } -c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat) const { +c10::SymBool SparseCsrTensorImpl::sym_is_contiguous_custom(MemoryFormat /*memory_format*/) const { TORCH_CHECK(false, "Sparse ", at::sparse_csr::layoutToString(layout_, /*upper=*/true), " tensors do not have is_contiguous"); } } // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index 14688163a374..e764f954db33 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -32,10 +32,10 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { public: explicit SparseCsrTensorImpl( - at::DispatchKeySet, + at::DispatchKeySet /*key_set*/, at::Device device, Layout layout, - const caffe2::TypeMeta); + const caffe2::TypeMeta /*data_type*/); void resize_(int64_t nnz, IntArrayRef size); void resize_and_clear_( @@ -86,7 +86,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { protected: IntArrayRef strides_custom() const override; SymIntArrayRef sym_strides_custom() const override; - SymBool sym_is_contiguous_custom(MemoryFormat) const override; + SymBool sym_is_contiguous_custom( + MemoryFormat /*memory_format*/) const override; public: void set_size(int64_t dim, int64_t new_size) override; diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index b10795fbc37e..a2c12fcfe8b9 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -46,7 +46,9 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { public: // Public for now... 
- explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); + explicit SparseTensorImpl( + at::DispatchKeySet /*key_set*/, + const caffe2::TypeMeta /*data_type*/); void release_resources() override; @@ -229,14 +231,14 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { - return _resize_(sparse_dim, dense_dim, size); + _resize_(sparse_dim, dense_dim, size); } void resize_( int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { - return _resize_(sparse_dim, dense_dim, size); + _resize_(sparse_dim, dense_dim, size); } // NOTE: this function will resize the sparse tensor and also set `indices` @@ -384,8 +386,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { private: explicit SparseTensorImpl( - at::DispatchKeySet, - const caffe2::TypeMeta, + at::DispatchKeySet /*key_set*/, + const caffe2::TypeMeta /*data_type*/, at::Tensor indices, at::Tensor values); diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index bd50282b46ec..1fa852686656 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -59,7 +59,7 @@ static inline void set_item(const Tensor& self, ArrayRef indices, c } } - return set_item(self, indices, value); + set_item(self, indices, value); } } // namespace indexing diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index a487589833e8..9291d2e66e5f 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -112,10 +112,10 @@ TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice); // `torch.tensor([1, 2])`) | `torch::tensor({1, 2})` struct TORCH_API TensorIndex final { // Case 1: `at::indexing::None` - TensorIndex(std::nullopt_t) : type_(TensorIndexType::None) {} + TensorIndex(std::nullopt_t /*unused*/) : type_(TensorIndexType::None) {} // Case 2: "..." / `at::indexing::Ellipsis` - TensorIndex(at::indexing::EllipsisIndexType) + TensorIndex(at::indexing::EllipsisIndexType /*unused*/) : type_(TensorIndexType::Ellipsis) {} TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) { TORCH_CHECK_VALUE( @@ -214,7 +214,7 @@ inline Tensor applySlice( "step must be greater than zero"); // See NOTE [nested tensor size for indexing] - if (self_sizes.has_value() && self_sizes.value().size() > 0) { + if (self_sizes.has_value() && !self_sizes.value().empty()) { // Skip this optimization if we are tracing, as the trace may be polymorphic // over the shape of the `self` tensor, and we still want to record // the slice. 
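The TensorIndex constructors touched above (std::nullopt / Ellipsis) are what back the braced C++ indexing API; a short usage sketch assuming the standard libtorch indexing surface:

#include <ATen/ATen.h>
#include <ATen/TensorIndexing.h>

void indexing_sketch() {
  using namespace at::indexing;
  at::Tensor t = at::arange(24).reshape({2, 3, 4});
  // Python: t[None, ..., 1:3]
  at::Tensor v = t.index({None, Ellipsis, Slice(1, 3)});
  // Python: t[0, :, 0] = -1
  t.index_put_({0, Slice(), 0}, -1);
  (void)v;
}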
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 9096cbfc68eb..d0bbe2d76548 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -56,7 +56,7 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } } -static OptionalTensorRef make_otr(const TensorBase &tensor) { +OptionalTensorRef make_otr(const TensorBase &tensor) { if (tensor.defined()) { return OptionalTensorRef(tensor); } else { @@ -765,7 +765,8 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { if (numel == 0) { return; } else if (numel < grain_size || at::get_num_threads() == 1) { - return serial_for_each(loop, {0, numel}); + serial_for_each(loop, {0, numel}); + return; } else { at::parallel_for(0, numel, grain_size, [&](int64_t begin, int64_t end) { serial_for_each(loop, {begin, end}); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index d8eebd4c06a4..d8593a80292b 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -250,7 +250,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { using PtrVector = SmallVector; using StrideVector = SmallVector; - void build(TensorIteratorConfig&); + void build(TensorIteratorConfig& /*config*/); // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. @@ -618,20 +618,20 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { #undef TORCH_DISALLOW_TEMPORARIES protected: // Mutable reference as it moves tensors out of TensorIteratorConfig - void populate_operands(TensorIteratorConfig&); + void populate_operands(TensorIteratorConfig& /*config*/); void mark_outputs(); - void mark_resize_outputs(const TensorIteratorConfig&); - void compute_mem_overlaps(const TensorIteratorConfig&); - void compute_shape(const TensorIteratorConfig&); - void compute_strides(const TensorIteratorConfig&); + void mark_resize_outputs(const TensorIteratorConfig& /*config*/); + void compute_mem_overlaps(const TensorIteratorConfig& /*config*/); + void compute_shape(const TensorIteratorConfig& /*config*/); + void compute_strides(const TensorIteratorConfig& /*config*/); void reorder_dimensions(); void permute_dimensions(IntArrayRef perm); - void compute_types(const TensorIteratorConfig&); + void compute_types(const TensorIteratorConfig& /*config*/); ScalarType compute_common_dtype(); void allocate_or_resize_outputs(); - bool fast_set_up(const TensorIteratorConfig&); - FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); - void compute_names(const TensorIteratorConfig&); + bool fast_set_up(const TensorIteratorConfig& /*config*/); + FastSetupType compute_fast_setup_type(const TensorIteratorConfig& /*config*/); + void compute_names(const TensorIteratorConfig& /*config*/); void propagate_names_to_outputs(); void coalesce_dimensions(); diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 34cb5329de6a..8236751679f0 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -273,11 +273,11 @@ void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layout layout) } void * maybe_data_ptr(const Tensor& tensor) { - return tensor.defined() ? (void *)tensor.data_ptr() : nullptr; + return tensor.defined() ? tensor.data_ptr() : nullptr; } void * maybe_data_ptr(const TensorArg& tensor) { - return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; + return tensor->defined() ? 
tensor->data_ptr() : nullptr; } void check_dim_size( diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 95a35bd5563a..e9c936b906c6 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -20,7 +20,7 @@ namespace at { -TORCH_API int _crash_if_asan(int); +TORCH_API int _crash_if_asan(int /*arg*/); // Converts a TensorList (i.e. ArrayRef to vector of TensorImpl*) // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat. diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 4b8b5f6c5d18..e3424cc4cb8e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -36,7 +36,7 @@ namespace { using weakref_type = c10::weak_intrusive_ptr; using val_type = std::tuple; -static ska::flat_hash_map& get_cached_casts() { +ska::flat_hash_map& get_cached_casts() { static ska::flat_hash_map cached_casts; return cached_casts; } @@ -148,7 +148,7 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_ Banned functions *******************************/ -static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional&, int64_t) { +static Tensor binary_cross_entropy_banned(const Tensor & /*unused*/, const Tensor & /*unused*/, const std::optional& /*unused*/, int64_t /*unused*/) { TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n" "Many models use a sigmoid layer right before the binary cross entropy layer.\n" "In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n" diff --git a/aten/src/ATen/core/CachingHostAllocator.cpp b/aten/src/ATen/core/CachingHostAllocator.cpp index 5939253caf55..f3ddaedc5ecd 100644 --- a/aten/src/ATen/core/CachingHostAllocator.cpp +++ b/aten/src/ATen/core/CachingHostAllocator.cpp @@ -6,9 +6,9 @@ namespace at { namespace { -static std::array +std::array allocator_array{}; -static std::array +std::array allocator_priority{}; } // anonymous namespace diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h index 53e95cd2d4cf..c9eacbed42ef 100644 --- a/aten/src/ATen/core/CachingHostAllocator.h +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -49,19 +50,57 @@ namespace { constexpr size_t MAX_SIZE_INDEX = 64; } +// A large reserved pinned memory segment that is created in advance which is used +// to allocate small pinned memory requests to avoid calling into expensive APIs. +// We never free this memory and move up the pointer as we allocate new blocks +// and when blocks are freed, they are cached in the free lists. +struct PinnedReserveSegment { + PinnedReserveSegment(void *start, size_t size) : start_(start), size_(size), + current_ptr_(start_), initialized_(true) {} + + PinnedReserveSegment() : start_(nullptr), size_(0), current_ptr_(nullptr), initialized_(false) {} + + bool initialized() { + return initialized_; + } + + void* allocate(size_t bytes) { + std::lock_guard guard(mutex_); + + // Round up the requested size to 4KB boundary for all including the small ones. 
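The 4 KiB round-up performed next is the usual power-of-two trick; a tiny standalone check of its behavior, not part of the patch:

#include <cstddef>

// Round `bytes` up to the next multiple of 4096 (valid because 4096 is a power of two).
constexpr std::size_t round_up_4k(std::size_t bytes) {
  return (bytes + 4096 - 1) & ~static_cast<std::size_t>(4096 - 1);
}

static_assert(round_up_4k(1) == 4096);
static_assert(round_up_4k(4096) == 4096);
static_assert(round_up_4k(4097) == 8192);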
+ size_t rounded_bytes = (bytes + 4096 - 1) & ~(4096 - 1); + + if (((uint8_t*)current_ptr_ + rounded_bytes) > ((uint8_t*)start_ + size_)) { + return nullptr; + } + + void* ptr = current_ptr_; + current_ptr_ = (uint8_t*)current_ptr_ + rounded_bytes; + return ptr; + } + + bool owns(void* ptr) { + return ptr >= start_ && ptr < (uint8_t*)start_ + size_; + } + + std::mutex mutex_; + void* start_; + size_t size_; + void* current_ptr_; + bool initialized_; +}; + // Struct containing memory allocator summary statistics for host. struct TORCH_API HostStats { - // COUNT: allocations requested by client code. Note that active - // count can be extracted by looking at current allocations - Stat allocation; - // COUNT: number of allocated segments from host memory allocation. - Stat segment; - - // SUM: bytes allocated by this memory alocator. Note that active bytes - // can be extracted by looking at current bytes allocated + // COUNT: total allocations (active) + Stat active_requests; + // SUM: bytes allocated/reserved by this memory alocator. (active) + Stat active_bytes; + // COUNT: total allocations (active + free) + Stat allocations; + // SUM: bytes allocated/reserved by this memory alocator. This accounts + // for both free and in-use blocks. Stat allocated_bytes; - // SUM: bytes reserved by this memory allocator (both free and used) - Stat reserved_bytes; // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds DurationStat host_alloc_time; @@ -75,6 +114,9 @@ struct TORCH_API HostStats { // COUNT: number of times cudaHostFree/cudaHostUnregister was called. int64_t num_host_free = 0; // This is derived from segment or timing + + // Count of cudaHostAlloc/cudaHostRegister per bucket + std::vector bucket_allocation = std::vector(MAX_SIZE_INDEX); }; // Struct containing memory allocator summary statistics for host, as they @@ -82,17 +124,22 @@ struct TORCH_API HostStats { // avoid locking the allocator while collecting stats. struct alignas(64) HostStatsStaged { std::mutex timing_mutex_; - // COUNT: allocations requested by client code resulting in a new segment/block allocation - // LOCK: access to this stat is protected by the allocator's blocks_mutex_ - Stat allocation; - // SUM: bytes within active memory blocks, including blocks that are - // currently in the free list. + // COUNT: total allocations (active + free) // LOCK: access to this stat is protected by the allocator's blocks_mutex_ + Stat allocations; + // SUM: bytes allocated/reserved by this memory alocator. This accounts + // for both free and in-use blocks. 
Stat allocated_bytes; - // COUNT: number of allocations per bucket + // COUNT: number of allocations per bucket (active) + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector active_bucket_stats = std::vector(MAX_SIZE_INDEX); + // SUM: bytes of allocation per bucket (active) + // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ + std::vector active_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); + // COUNT: number of allocations per bucket (active + free) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocation_bucket_stats = std::vector(MAX_SIZE_INDEX); - // SUM: bytes of allocation per bucket + // SUM: bytes of allocation per bucket (active + free) // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_ std::vector allocated_bytes_bucket_stats = std::vector(MAX_SIZE_INDEX); // SUM: time spent in cudaHostAlloc/cudaHostRegister @@ -211,12 +258,6 @@ struct CachingHostAllocatorImpl { // Check in the recently freed blocks with pending events to see if we // can reuse them. Call get_free_block again after processing events if (pinned_use_background_threads()) { - process_events_for_specific_size(roundSize); - block = get_free_block(roundSize); - if (block) { - return {block->ptr_, reinterpret_cast(block)}; - } - // Launch the background thread and process events in a loop. static bool background_thread_flag [[maybe_unused]] = [this] { getBackgroundThreadPool()->run([&]() { @@ -278,8 +319,6 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); - stats_.allocation_bucket_stats[index].decrease(1); - stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); } else { // restore these events that record by used streams. std::lock_guard g(events_mutex_); @@ -339,9 +378,12 @@ struct CachingHostAllocatorImpl { for (auto* block : blocks_to_remove) { blocks_.erase(block); ptr_to_block_.erase(block->ptr_); - stats_.allocation.decrease(1); - stats_.allocated_bytes.decrease(block->size_); + auto index = size_index(block->size_); free_block(block); + stats_.allocations.decrease(1); + stats_.allocated_bytes.decrease(block->size_); + stats_.allocation_bucket_stats[index].decrease(1); + stats_.allocated_bytes_bucket_stats[index].decrease(block->size_); delete block; } } @@ -388,16 +430,17 @@ struct CachingHostAllocatorImpl { // per bucket (we pick index 0 arbitrarily). These are also all the host // allocations, not taking into account caching and free lists. if (i == 0) { - stats.segment = stats_.allocation; - stats.reserved_bytes = stats_.allocated_bytes; - stats.num_host_alloc = stats.segment.allocated; - stats.num_host_free = stats.segment.freed; + stats.allocations = stats_.allocations; + stats.allocated_bytes = stats_.allocated_bytes; + stats.num_host_alloc = stats.allocations.allocated; + stats.num_host_free = stats.allocations.freed; } // Bucket stats need to be merged with the slow-path stats. We do this in // a best effort manner, since we can't really replay the cached events per bucket. 
- add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]); - add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]); + add_bucket_stats(stats.active_requests, stats_.active_bucket_stats[i]); + add_bucket_stats(stats.active_bytes, stats_.active_bytes_bucket_stats[i]); + stats.bucket_allocation[i] = stats_.allocation_bucket_stats[i].allocated; } // Get the timing stats @@ -421,9 +464,11 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { - stats_.allocation.reset_accumulated(); + stats_.allocations.reset_accumulated(); stats_.allocated_bytes.reset_accumulated(); } + stats_.active_bucket_stats[i].reset_accumulated(); + stats_.active_bytes_bucket_stats[i].reset_accumulated(); stats_.allocation_bucket_stats[i].reset_accumulated(); stats_.allocated_bytes_bucket_stats[i].reset_accumulated(); } @@ -446,9 +491,11 @@ struct CachingHostAllocatorImpl { std::lock_guard gb(blocks_mutex_, std::adopt_lock); if (i == 0) { - stats_.allocation.reset_peak(); + stats_.allocations.reset_peak(); stats_.allocated_bytes.reset_peak(); } + stats_.active_bucket_stats[i].reset_peak(); + stats_.active_bytes_bucket_stats[i].reset_peak(); stats_.allocation_bucket_stats[i].reset_peak(); stats_.allocated_bytes_bucket_stats[i].reset_peak(); } @@ -465,7 +512,7 @@ struct CachingHostAllocatorImpl { virtual void add_allocated_block(B* block) { std::lock_guard g(blocks_mutex_); blocks_.insert(block); - stats_.allocation.increase(1); + stats_.allocations.increase(1); stats_.allocated_bytes.increase(block->size_); ptr_to_block_.insert({block->ptr_, block}); @@ -478,6 +525,8 @@ struct CachingHostAllocatorImpl { std::lock_guard g(free_list_[index].mutex_); stats_.allocation_bucket_stats[index].increase(1); stats_.allocated_bytes_bucket_stats[index].increase(size); + stats_.active_bucket_stats[index].increase(1); + stats_.active_bytes_bucket_stats[index].increase(size); } } @@ -488,8 +537,8 @@ struct CachingHostAllocatorImpl { B* block = free_list_[index].list_.back(); free_list_[index].list_.pop_back(); block->allocated_ = true; - stats_.allocation_bucket_stats[index].increase(1); - stats_.allocated_bytes_bucket_stats[index].increase(size); + stats_.active_bucket_stats[index].increase(1); + stats_.active_bytes_bucket_stats[index].increase(size); return block; } return nullptr; @@ -583,8 +632,8 @@ struct CachingHostAllocatorImpl { auto index = size_index(block->size_); std::lock_guard g(free_list_[index].mutex_); free_list_[index].list_.push_back(block); - stats_.allocation_bucket_stats[index].decrease(1); - stats_.allocated_bytes_bucket_stats[index].decrease(size); + stats_.active_bucket_stats[index].decrease(1); + stats_.active_bytes_bucket_stats[index].decrease(size); if (size != -1) { return; } diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index eaca01fe5e09..0bbeb9ddc13a 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -49,7 +49,7 @@ static void check_unique_names(DimnameList names) { } void check_names_valid_for(const TensorBase& tensor, DimnameList names) { - return impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); + impl::check_names_valid_for(tensor.unsafeGetTensorImpl(), names); } void check_names_valid_for(size_t tensor_dim, DimnameList names) { diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 81998e160185..52acae90b128 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ 
b/aten/src/ATen/core/NamedTensor.h @@ -27,11 +27,11 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { HasNonWildcard }; - explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names) + explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, DimnameList names) : names_(names.vec()) { check_invariants(); } - explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector&& names) + explicit NamedTensorMeta(HAS_NON_WILDCARD /*unused*/, std::vector&& names) : names_(std::move(names)) { check_invariants(); } @@ -52,13 +52,13 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); })); } - void set_names(HAS_NON_WILDCARD, DimnameList new_names) { + void set_names(HAS_NON_WILDCARD /*unused*/, DimnameList new_names) { TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); std::copy(new_names.begin(), new_names.end(), names_.begin()); check_invariants(); } - void set_names(HAS_NON_WILDCARD, std::vector&& new_names) { + void set_names(HAS_NON_WILDCARD /*unused*/, std::vector&& new_names) { TORCH_INTERNAL_ASSERT(new_names.size() == names_.size()); names_ = std::move(new_names); check_invariants(); diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 413055d3fad6..e8bac545933c 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -229,10 +229,10 @@ class philox_engine { } - static const uint32_t kPhilox10A = 0x9E3779B9; - static const uint32_t kPhilox10B = 0xBB67AE85; - static const uint32_t kPhiloxSA = 0xD2511F53; - static const uint32_t kPhiloxSB = 0xCD9E8D57; + static constexpr uint32_t kPhilox10A = 0x9E3779B9; + static constexpr uint32_t kPhilox10B = 0xBB67AE85; + static constexpr uint32_t kPhiloxSA = 0xD2511F53; + static constexpr uint32_t kPhiloxSB = 0xCD9E8D57; }; typedef philox_engine Philox4_32; diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index efd9508ce15c..39f4e7cb6976 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace { @@ -53,20 +54,24 @@ void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_ TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); // c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value()); // StashTLSOnEntryGuard stash_guard; - c10::impl::ExcludeDispatchKeyGuard guard(after_Python_keyset); + c10::impl::ExcludeDispatchKeyGuard exclude_guard(after_Python_keyset); + const auto& schema = op.schema(); + const auto num_arguments = schema.arguments().size(); // If Torch Dispatch Mode is active, use its PyInterpreter for dispatch const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); if (mode_stack_len > 0) { + RECORD_FUNCTION("PythonDispatchMode", torch::jit::last(*stack, num_arguments)); const auto& cur_torch_dispatch_mode_state = c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); cur_torch_dispatch_mode_state->pyinterpreter()->dispatch(op, stack); return; } + RECORD_FUNCTION("PythonSubclass", torch::jit::last(*stack, num_arguments)); + // Otherwise, find a PyInterpreter on a Tensor - const auto& schema = op.schema(); - const auto num_arguments = schema.arguments().size(); + // It is safe to dispatch on the very first Tensor with a pyobj_interpreter // without checking the interpreters of any of the arguments, 
because when // we actually run dispatch(), we will take out PyObjects in the context diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index bec323c7d25b..83b39de34d78 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -13,7 +13,7 @@ class TORCH_API PythonOpRegistrationTrampoline final { public: // Returns true if you successfully registered yourself (that means // you are in the hot seat for doing the operator registrations!) - static bool registerInterpreter(c10::impl::PyInterpreter*); + static bool registerInterpreter(c10::impl::PyInterpreter* /*interp*/); // Returns nullptr if no interpreter has been registered yet. static c10::impl::PyInterpreter* getInterpreter(); diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 246418ad7ce8..c5f887f096cd 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -138,7 +138,7 @@ void Tensor::_backward(TensorList inputs, const std::optional& gradient, std::optional keep_graph, bool create_graph) const { - return impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); + impl::GetVariableHooks()->_backward(*this, inputs, gradient, keep_graph, create_graph); } const TensorBase& TensorBase::requires_grad_(bool _requires_grad) const { @@ -173,4 +173,12 @@ unsigned TensorBase::_register_hook(std::function return impl::GetVariableHooks()->_register_hook(*this, std::move(hook)); } +std::optional TensorBase::grad_dtype() const { + return impl::GetVariableHooks()->grad_dtype(*this); +} + +void TensorBase::set_grad_dtype(const std::optional& grad_dtype) const { + return impl::GetVariableHooks()->set_grad_dtype(*this, grad_dtype); +} + } // namespace at diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 5f43738ea0fa..1d0a3e73a5a5 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -100,7 +100,7 @@ class TORCH_API TensorBase { // Create a Tensor with a +0 reference count. Special care must be // taken to avoid decrementing this reference count at destruction // time. Intended to support MaybeOwnedTraits. 
- explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs) + explicit TensorBase(unsafe_borrow_t /*unused*/, const TensorBase& rhs) : impl_(c10::intrusive_ptr(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {} friend MaybeOwnedTraits; @@ -930,6 +930,10 @@ class TORCH_API TensorBase { const TensorBase& requires_grad_(bool _requires_grad=true) const; + std::optional grad_dtype() const; + + void set_grad_dtype(const std::optional& grad_dtype) const; + // View Variables //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -950,7 +954,7 @@ class TORCH_API TensorBase { c10::intrusive_ptr impl_; private: - TensorBase __dispatch_contiguous(c10::MemoryFormat) const; + TensorBase __dispatch_contiguous(c10::MemoryFormat /*memory_format*/) const; }; inline DeviceIndex get_device(const TensorBase& self) { diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index f81018a8e674..dad18bd019bb 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -117,7 +117,7 @@ C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) { template <> C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) { // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function - return median + sigma * at::tan(c10::pi * (val - static_cast(0.5))); + return median + sigma * at::tan(c10::pi * (val - 0.5)); } /** diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index f9c0aa4a5fc1..c0f270700e3c 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -68,6 +68,8 @@ struct TORCH_API VariableHooksInterface { const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; + virtual std::optional grad_dtype(const TensorBase&) const = 0; + virtual void set_grad_dtype(const TensorBase&, const std::optional&) const = 0; }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); diff --git a/aten/src/ATen/core/boxing/BoxedKernel.h b/aten/src/ATen/core/boxing/BoxedKernel.h index 62b915885a80..c5e46d8de000 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel.h +++ b/aten/src/ATen/core/boxing/BoxedKernel.h @@ -18,10 +18,10 @@ class KernelFunction; // implementation notes; notably, this does NOT actually go through the // boxing/unboxing codepath. TORCH_API void fallthrough_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*unused*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); // Note [Ambiguity in AutogradOther kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,10 +62,10 @@ TORCH_API void fallthrough_kernel( // than arbitrarily pick one or the other, we just register a kernel that raises // an error and let the user decide how to proceed. TORCH_API void ambiguous_autogradother_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); // Note [named_not_supported_kernel] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -75,10 +75,10 @@ TORCH_API void ambiguous_autogradother_kernel( // give a good error message in cases when boxing is not supported). When // boxing is universally supported this can be removed. 
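Stepping back to the grad_dtype()/set_grad_dtype() accessors introduced on TensorBase above: they route through VariableHooksInterface, and only the call shape is fixed by these headers. A hypothetical usage sketch, assuming the optional payload is at::ScalarType; the actual behavior is defined by the autograd side of this patch, not shown here:

#include <optional>

#include <ATen/ATen.h>

// Call-shape sketch only: the dtype actually used for `.grad()` is decided by the
// autograd hooks this patch wires up elsewhere.
void grad_dtype_sketch(at::Tensor& param) {
  param.set_grad_dtype(at::kBFloat16);                    // request a non-default grad dtype
  std::optional<at::ScalarType> gd = param.grad_dtype();  // nullopt would mean "unset"
  (void)gd;
}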
[[noreturn]] TORCH_API void named_not_supported_kernel( - OperatorKernel*, - const OperatorHandle&, - DispatchKeySet, - Stack*); + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); /** * BoxedKernel is similar to a std::function storing a boxed kernel. @@ -185,16 +185,16 @@ class TORCH_API BoxedKernel final { template static void make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*unused*/, Stack* stack); template static void make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*ks*/, Stack* stack); explicit BoxedKernel( diff --git a/aten/src/ATen/core/boxing/BoxedKernel_impl.h b/aten/src/ATen/core/boxing/BoxedKernel_impl.h index 1960607c6bc8..04ba1368f070 100644 --- a/aten/src/ATen/core/boxing/BoxedKernel_impl.h +++ b/aten/src/ATen/core/boxing/BoxedKernel_impl.h @@ -2,7 +2,7 @@ namespace c10 { -inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {} +inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} inline BoxedKernel::BoxedKernel( std::unique_ptr functor, @@ -11,9 +11,9 @@ inline BoxedKernel::BoxedKernel( template inline void BoxedKernel::make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, - DispatchKeySet, + DispatchKeySet /*unused*/, Stack* stack) { // Note that we're dropping the DispatchKeySet argument. // See Note [Plumbing Keys Through The Dispatcher 2] for details. @@ -22,7 +22,7 @@ inline void BoxedKernel::make_boxed_function( template inline void BoxedKernel::make_boxed_function( - OperatorKernel*, + OperatorKernel* /*unused*/, const OperatorHandle& opHandle, DispatchKeySet ks, Stack* stack) { diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index c099c456814a..dd2fb32e6817 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -10,7 +10,7 @@ namespace c10 { // be handled specially. Its semantics is that it redispatches to the // *next* dispatch key that would have been processed, skipping the current // one. -void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*) { +void fallthrough_kernel(OperatorKernel* /*unused*/, const OperatorHandle& /*unused*/, DispatchKeySet /*unused*/, Stack* /*unused*/) { TORCH_INTERNAL_ASSERT(0, "fallthrough_kernel was executed but it should have been short-circuited by the dispatcher. " "This could occur if you registered a fallthrough kernel as a override for a specific operator " @@ -19,7 +19,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, "let us know in the bug tracker."); } -void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +void ambiguous_autogradother_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. 
" "This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering " @@ -32,7 +32,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } -void named_not_supported_kernel(OperatorKernel*, const OperatorHandle& op, DispatchKeySet, Stack*) { +void named_not_supported_kernel(OperatorKernel* /*unused*/, const OperatorHandle& op, DispatchKeySet /*unused*/, Stack* /*unused*/) { // DO NOT LOOK AT STACK, YOU HAVE SHORT CIRCUITED BOXING // See Note [named_not_supported_kernel] TORCH_CHECK(0, diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 4300217235b8..eb0cf833dfc2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -229,7 +229,7 @@ class TORCH_API KernelFunction final { * &unboxed_func>(); */ template - static KernelFunction makeFromUnboxedFunction(FuncPtr); + static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); /** * Create a KernelFunction from an unboxed function. @@ -271,7 +271,7 @@ class TORCH_API KernelFunction final { std::string dumpState() const; // For testing internal invariants only - bool _equalsBoxedAndUnboxed(const KernelFunction&) const; + bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; // Register a token to be invalidated when this KernelFunction is destroyed void registerToken(std::weak_ptr token) const; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 672309ec19a2..bb981c1d4efd 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -20,9 +20,7 @@ make_unique_base(Args&&... args) { } // namespace detail inline KernelFunction::KernelFunction() - : boxed_kernel_func_(), - unboxed_kernel_func_(nullptr), - sym_unboxed_kernel_func_(nullptr) {} + : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} inline KernelFunction::~KernelFunction() { if (tokens_) { diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 68e25cccd44c..7fbc3b982609 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -131,7 +131,7 @@ C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( new (dest++) IValue(options.pinned_memory()); } -inline void boxArgsToStack(IValue*&) {} +inline void boxArgsToStack(IValue*& /*unused*/) {} template C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( @@ -185,7 +185,7 @@ struct PopResult> final { template static Result pop_to_tuple_impl( Stack& stack, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple((std::move(stack[indices]).template to())...); } }; diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 20dfde846e64..34b1514f32cd 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -561,7 +561,7 @@ struct wrap_kernel_functor_unboxed_< // doesn't use && static ReturnType call( OperatorKernel* functor, - DispatchKeySet, + DispatchKeySet /*unused*/, ParameterTypes... 
args) { KernelFunctor* functor_ = static_cast(functor); // Note [Plumbing Keys Through The Dispatcher 2] @@ -629,8 +629,8 @@ call_functor_with_args_from_stack_( OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack, - std::index_sequence, - guts::typelist::typelist*) { + std::index_sequence /*unused*/, + guts::typelist::typelist* /*unused*/) { (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would // be unused and we have to silence the compiler warning. @@ -708,7 +708,7 @@ struct push_outputs, AllowDeprecatedTypes> final { static void call_( std::tuple&& output, Stack* stack, - std::index_sequence) { + std::index_sequence /*unused*/) { torch::jit::push( *stack, return_to_ivalue::call( @@ -718,7 +718,7 @@ struct push_outputs, AllowDeprecatedTypes> final { static void copy_( const std::tuple& output, Stack* stack, - std::index_sequence) { + std::index_sequence /*unused*/) { torch::jit::push( *stack, return_to_ivalue::copy( @@ -741,7 +741,7 @@ struct make_boxed_from_unboxed_functor final { static void call( OperatorKernel* functor, - const OperatorHandle&, + const OperatorHandle& /*unused*/, DispatchKeySet dispatchKeySet, Stack* stack) { using ReturnType = diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 5ab1ace1685f..8c837871dff7 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -63,13 +63,13 @@ struct BuiltinOpFunction : public Function { bool call( Stack& stack, - std::optional, - c10::function_ref) override { + std::optional /*unused*/, + c10::function_ref /*unused*/) override { run(stack); return false; } - bool call(Stack& stack, c10::function_ref) + bool call(Stack& stack, c10::function_ref /*unused*/) override { run(stack); return false; diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index ecc4bc7b5d89..dbd00e9c5290 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -80,7 +80,8 @@ struct MultiDispatchKeySet : at::IterArgs { ts = ts | x.key_set(); } } - [[noreturn]] void operator()(at::ArrayRef>) { + [[noreturn]] void operator()( + at::ArrayRef> /*unused*/) { // Just checking that the handling of Tensor?[] didn't change. 
TORCH_INTERNAL_ASSERT(false); } @@ -95,7 +96,7 @@ struct MultiDispatchKeySet : at::IterArgs { } } template - void operator()(const T&) { + void operator()(const T& /*unused*/) { // do nothing } }; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 91a5f6459617..4f9d7c6ec0db 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -76,13 +76,7 @@ void _print_dispatch_trace(const std::string& label, const std::string& op_name, OpRegistrationListener::~OpRegistrationListener()= default; -Dispatcher::Dispatcher() -: operators_() -, operatorLookupTable_() -, backendFallbackKernels_() -, listeners_(std::make_unique()) -, cond_var_() -, guard_(std::make_shared()) +Dispatcher::Dispatcher(): backendFallbackKernels_(), listeners_(std::make_unique()), guard_(std::make_shared()) {} Dispatcher::~Dispatcher() { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 43eb0028c70f..29139a294745 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -96,7 +96,7 @@ class TORCH_API Dispatcher final { friend class TypedOperatorHandle; struct Guard final { - Guard() : alive(true), mutex() {} + Guard() : alive(true) {} std::atomic alive; std::mutex mutex; }; @@ -496,7 +496,7 @@ class TORCH_API OperatorHandle { } void checkInvariants() const { - return operatorDef_->op.checkInvariants(); + operatorDef_->op.checkInvariants(); } c10::ArrayRef getTags() const { @@ -633,7 +633,7 @@ class TypedOperatorHandle final : public OperatorHandle { namespace detail { template -inline void unused_arg_(const Args&...) {} +inline void unused_arg_(const Args&... /*unused*/) {} // CaptureKernelCall is intended to capture return values from Dispatcher // unboxed kernel calls. A record function may request to get outputs from the @@ -932,7 +932,7 @@ inline void Dispatcher::redispatchBoxed( } #endif const auto& kernel = entry.lookup(dispatchKeySet); - return kernel.callBoxed(op, dispatchKeySet, stack); + kernel.callBoxed(op, dispatchKeySet, stack); } } // namespace c10 diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index c172e9b9c609..7040049ddf1e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -62,17 +62,7 @@ static const auto& getDispatchTableIndexToKey() { } OperatorEntry::OperatorEntry(OperatorName&& operator_name) -: name_(std::move(operator_name)) -, schema_() -#ifndef C10_MOBILE -, tags_() -#endif -, dispatchTable_() -, dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, kernels_() -, cpp_signature_() -, sym_cpp_signature_() -, is_observed_(ObservedOperators::isObserved(name_)) +: name_(std::move(operator_name)), dispatchTable_(), dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()), is_observed_(ObservedOperators::isObserved(name_)) { // Pick up any backend fallbacks that were registered prior to this // OperatorEntry being created. diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 59b54ce1d9d3..cc5736ba0e77 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -105,7 +105,7 @@ class TORCH_API OperatorEntry final { // versa that is an error. 
(Refcounting for the registrations is // handled in the OperatorHandle in Dispatcher) void registerSchema( - FunctionSchema&&, + FunctionSchema&& /*schema*/, std::string&& debug, std::vector tags = {}); void deregisterSchema(); diff --git a/aten/src/ATen/core/dynamic_type.cpp b/aten/src/ATen/core/dynamic_type.cpp index d4596ed2ca73..2b1a32bd0ac8 100644 --- a/aten/src/ATen/core/dynamic_type.cpp +++ b/aten/src/ATen/core/dynamic_type.cpp @@ -177,7 +177,7 @@ bool DynamicType::equals(const Type& rhs) const { return equals(*create(rhs)); } -bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream*) const { +bool DynamicType::isSubtypeOfExt(const Type& rhs, std::ostream* /*why_not*/) const { auto other = create(rhs); if (tag_ == other->tag_) { if (equals(*other)) { @@ -371,7 +371,7 @@ DynamicTypePtr ivalue::TupleTypeFactory::create( } DynamicTypePtr ivalue::TupleTypeFactory::fallback( - const Type&) { + const Type& /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return nullptr; } diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index 2ba841e44e20..ee0d077e5c51 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -138,8 +138,8 @@ class DynamicType : public SharedType { struct Arguments { Arguments() = default; - Arguments(c10::ArrayRef); - Arguments(const std::vector&, c10::ArrayRef); + Arguments(c10::ArrayRef /*args*/); + Arguments(const std::vector& /*names*/, c10::ArrayRef /*args*/); std::vector elems; }; @@ -156,15 +156,15 @@ class DynamicType : public SharedType { static const TypeKind Kind = TypeKind::DynamicType; static TORCH_API DynamicTypePtr create(Type& ty); - explicit DynamicType(Tag, Arguments); - explicit DynamicType(Tag, std::string_view, Arguments); + explicit DynamicType(Tag /*tag*/, Arguments /*arguments*/); + explicit DynamicType(Tag /*tag*/, std::string_view /*name*/, Arguments /*arguments*/); DynamicType(DynamicType&& other) = delete; DynamicType(const DynamicType&) = delete; DynamicType& operator=(const DynamicType&) = delete; DynamicType& operator=(DynamicType&&) = delete; - TypePtr containedType(size_t) const override; + TypePtr containedType(size_t /*i*/) const override; size_t containedTypeSize() const override; Tag tag() const { return tag_; diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index 7e8a765a05ab..83db2ec9d71d 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -96,15 +96,15 @@ struct TORCH_API Function { // Overload for server interpreter, a bailout size is needed for graph // executor. virtual bool call( - Stack&, - std::optional, - c10::function_ref) { + Stack& /*unused*/, + std::optional /*unused*/, + c10::function_ref /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } // Overload for mobile interpreter. - virtual bool call(Stack&, c10::function_ref) { + virtual bool call(Stack& /*unused*/, c10::function_ref /*unused*/) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 72589436606e..264c7aff2cca 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -357,7 +357,7 @@ IValue IValue::equals(const IValue& rhs) const { case Tag::Enum: return lhs.toEnumHolder()->is(*rhs.toEnumHolder()); case Tag::Uninitialized: - // Unitialized ivalues show up in no-ops when the compiler can prove a + // Uninitialized ivalues show up in no-ops when the compiler can prove a // value will never be used. 
Just return false on any equality comparison. return false; } diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index ab2039e05820..d9516ed900e3 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -624,7 +624,14 @@ struct TORCH_API IValue final { IValue(const c10::SymBool& i) { if (auto mi = i.maybe_as_bool()) { tag = Tag::Bool; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ payload.u.as_int = *mi; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* due to byteorder if value assigned as_int, as_bool actually is not set correctly */ + payload.u.as_bool = *mi; +#else +#error Unexpected or undefined __BYTE_ORDER__ +#endif } else { tag = Tag::SymBool; payload.u.as_intrusive_ptr = i.toSymNodeImpl().release(); @@ -847,7 +854,7 @@ struct TORCH_API IValue final { IValue(std::optional v); template = nullptr> IValue(c10::OptionalArrayRef v); - IValue(std::nullopt_t); + IValue(std::nullopt_t /*unused*/); // ClassType IValue(c10::intrusive_ptr v); diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 1251c4c0c210..89759560c3ea 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -660,7 +660,7 @@ struct TORCH_API TupleTypeFactory { template <> struct TORCH_API TupleTypeFactory { static DynamicTypePtr create(const std::vector& elemTypes); - static DynamicTypePtr fallback(const Type&); + static DynamicTypePtr fallback(const Type& /*unused*/); }; struct TORCH_API Tuple : c10::intrusive_ptr_target { @@ -1682,7 +1682,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target { namespace detail { struct _guarded_unsigned_long_unique_dummy final { - _guarded_unsigned_long_unique_dummy(int64_t){} + _guarded_unsigned_long_unique_dummy(int64_t /*unused*/){} }; using _guarded_unsigned_long = std::conditional_t< std::is_same_v || @@ -1776,7 +1776,7 @@ template // native_functions.yaml still return std::vector. // C10_DEPRECATED_MESSAGE("IValues based on std::vector are potentially slow // and deprecated. Please use torch::List instead.") -std::vector generic_to(IValue ivalue, _fake_type>) { +std::vector generic_to(IValue ivalue, _fake_type> /*unused*/) { // We need to do a deep copy of the vector because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. 
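Annotation, not part of the patch: the __BYTE_ORDER__ guard added to IValue's SymBool constructor above exists because payload.u is a union whose as_bool member aliases the lowest-addressed byte, while as_int spans the full word; on big-endian targets the set byte of an integer 1 is at the opposite end of the storage. A minimal, self-contained sketch of the pitfall (illustrative only, not PyTorch code):

#include <cstdint>
#include <cstring>
#include <iostream>

union Payload {
  int64_t as_int;
  bool as_bool;
};

int main() {
  Payload p{};
  p.as_int = 1; // sets the least significant byte of the 8-byte integer
  unsigned char first_byte = 0;
  std::memcpy(&first_byte, &p, 1); // the byte a bool member would occupy
  // Prints 1 on little-endian but 0 on big-endian, which is why the patch writes
  // as_bool directly when __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__.
  std::cout << static_cast<int>(first_byte) << "\n";
  return 0;
}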
@@ -1826,18 +1826,18 @@ c10::intrusive_ptr IValue::toCustomClass() const& { } template -T generic_to(IValue ivalue, _fake_type) { +T generic_to(IValue ivalue, _fake_type /*unused*/) { using ElemType = typename std::remove_pointer::type::element_type; return std::move(ivalue).template toCustomClass(); } template -tagged_capsule generic_to(IValue ivalue, _fake_type>) { +tagged_capsule generic_to(IValue ivalue, _fake_type> /*unused*/) { return tagged_capsule{std::move(ivalue)}; } template -c10::List generic_to(IValue ivalue, _fake_type>) { +c10::List generic_to(IValue ivalue, _fake_type> /*unused*/) { return impl::toTypedList(std::move(ivalue).toList()); } @@ -1867,7 +1867,7 @@ std::vector createVectorFromList(const c10::List& impl) { } template -OptionalArray generic_to(IValue ivalue, _fake_type>) { +OptionalArray generic_to(IValue ivalue, _fake_type> /*unused*/) { if (ivalue.isNone()) { return {}; } @@ -1880,8 +1880,8 @@ namespace detail { template std::array generic_to_array( IValue ivalue, - _fake_type>, - std::index_sequence) { + _fake_type> /*unused*/, + std::index_sequence /*unused*/) { // We need to do a deep copy of the array because there might be other // references to this same IValue that also use the list. We can't just // move the elements out. @@ -1906,7 +1906,7 @@ std::array generic_to( template c10::Dict generic_to( IValue ivalue, - _fake_type>) { + _fake_type> /*unused*/) { return impl::toTypedDict(std::move(ivalue).toGenericDict()); } @@ -1915,7 +1915,7 @@ C10_DEPRECATED_MESSAGE( "IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict instead.") std::unordered_map generic_to( IValue ivalue, - _fake_type>) { + _fake_type> /*unused*/) { std::unordered_map specialized_dict; for (const auto& item : std::move(ivalue).toGenericDict()) { @@ -1926,7 +1926,7 @@ std::unordered_map generic_to( } template -std::optional generic_to(IValue ivalue, _fake_type>) { +std::optional generic_to(IValue ivalue, _fake_type> /*unused*/) { if (ivalue.isNone()) { return std::nullopt; } @@ -1937,7 +1937,7 @@ namespace detail { template Tuple generic_to_tuple_impl( const ivalue::TupleElements& t, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple( t[INDEX].to::type>()...); } @@ -1951,7 +1951,7 @@ template < std::is_lvalue_reference..., std::negation>...>, std::nullptr_t> = nullptr> -std::tuple generic_to(const IValue& ivalue, _fake_type>) { +std::tuple generic_to(const IValue& ivalue, _fake_type> /*unused*/) { const auto& vals = ivalue.toTupleRef().elements(); TORCH_CHECK(vals.size() == sizeof...(Args)); return detail::generic_to_tuple_impl>(vals, Indices{}); @@ -2311,7 +2311,7 @@ inline IValue::IValue(std::optional v) : IValue() { } } -inline IValue::IValue(std::nullopt_t) : IValue() {} +inline IValue::IValue(std::nullopt_t /*unused*/) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object) { @@ -2482,15 +2482,15 @@ namespace ivalue { namespace detail { template -IValue from_(T&& x, std::true_type) { +IValue from_(T&& x, std::true_type /*unused*/) { return IValue(std::forward(x)); } template -IValue from_(c10::intrusive_ptr x, std::false_type) { +IValue from_(c10::intrusive_ptr x, std::false_type /*unused*/) { return IValue(std::move(x)); } template -IValue from_(T&& /*x*/, std::false_type) { +IValue from_(T&& /*x*/, std::false_type /*unused*/) { static_assert( guts::false_t::value, "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)"); @@ -2546,19 
+2546,19 @@ struct MaybeOwnedTraits { return &borrow; } - static bool debugBorrowIsValid(const borrow_type&) { + static bool debugBorrowIsValid(const borrow_type& /*unused*/) { return true; } }; template <> struct IValue::TagType { - static TORCH_API c10::TypePtr get(const IValue&); + static TORCH_API c10::TypePtr get(const IValue& /*v*/); }; template <> struct IValue::TagType { - static TORCH_API c10::TypePtr get(const IValue&); + static TORCH_API c10::TypePtr get(const IValue& /*v*/); }; template diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index c15e5f72af27..d8e7b7e8b55a 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType { std::shared_ptr schema_; }; -// the common supertype of all Enums, only used in operator registraion. +// the common supertype of all Enums, only used in operator registration. // EnumType <: AnyEnumType for all Enums struct AnyEnumType; using AnyEnumTypePtr = SingletonTypePtr; diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index a393e0290458..0ee79ed85930 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -44,7 +44,7 @@ constexpr int checkStaticTypes() { } template -constexpr std::array createArgumentVectorFromTypes(std::index_sequence) { +constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { return ( // Check types for common errors checkStaticTypes(), diff --git a/aten/src/ATen/core/op_registration/op_allowlist.h b/aten/src/ATen/core/op_registration/op_allowlist.h index 3e8e03f9fa4c..1f39ba4e3871 100644 --- a/aten/src/ATen/core/op_registration/op_allowlist.h +++ b/aten/src/ATen/core/op_registration/op_allowlist.h @@ -114,7 +114,7 @@ constexpr bool allowlist_contains(std::string_view allowlist, std::string_view i } next++; } else { - if (allowlist.substr(cur).compare(item) == 0) { + if (allowlist.substr(cur) == item) { return true; } break; diff --git a/aten/src/ATen/core/op_registration/op_registration.cpp b/aten/src/ATen/core/op_registration/op_registration.cpp index b5ae2290b5ad..b34134309cb7 100644 --- a/aten/src/ATen/core/op_registration/op_registration.cpp +++ b/aten/src/ATen/core/op_registration/op_registration.cpp @@ -73,7 +73,7 @@ c10::FunctionSchema RegisterOperators::inferSchemaFromKernels_( std::optional inferred_schema = std::nullopt; for (const auto& kernel : options.kernels) { - if (nullptr != kernel.inferred_function_schema.get()) { + if (nullptr != kernel.inferred_function_schema) { if (!inferred_schema.has_value()) { inferred_schema = *kernel.inferred_function_schema; break; diff --git a/aten/src/ATen/core/op_registration/op_registration.h b/aten/src/ATen/core/op_registration/op_registration.h index 7a44cfa49b07..d441269bf297 100644 --- a/aten/src/ATen/core/op_registration/op_registration.h +++ b/aten/src/ATen/core/op_registration/op_registration.h @@ -411,7 +411,6 @@ class TORCH_API RegisterOperators final { Options() : schemaOrName_(std::nullopt) - , kernels() , aliasAnalysisKind_(std::nullopt) {} @@ -420,7 +419,6 @@ class TORCH_API RegisterOperators final { struct KernelRegistrationConfig final { KernelRegistrationConfig() : dispatch_key(std::nullopt) - , func() , cpp_signature(std::nullopt) , inferred_function_schema(nullptr) {} diff --git a/aten/src/ATen/core/operator_name.h b/aten/src/ATen/core/operator_name.h index 22e1f427b632..4c138ee50456 100644 
--- a/aten/src/ATen/core/operator_name.h +++ b/aten/src/ATen/core/operator_name.h @@ -83,7 +83,7 @@ inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) { } TORCH_API std::string toString(const OperatorName& opName); -TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&); +TORCH_API std::ostream& operator<<(std::ostream& /*os*/, const OperatorName& /*opName*/); } // namespace c10 diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index 0859e04c7d2d..011a1750ecaa 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -16,7 +16,7 @@ class SingletonTypePtr { /* implicit */ SingletonTypePtr(T* p) : repr_(p) {} // We need this to satisfy Pybind11, but it shouldn't be hit. - explicit SingletonTypePtr(std::shared_ptr) { TORCH_CHECK(false); } + explicit SingletonTypePtr(std::shared_ptr /*unused*/) { TORCH_CHECK(false); } using element_type = typename std::shared_ptr::element_type; diff --git a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h index d269e1073959..9e0b189bdac8 100644 --- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h +++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h @@ -308,8 +308,8 @@ Vectorized inline operator/( } inline Vectorized::Vectorized() { - const short zero = 0; - values = svdup_n_bf16(c10::bit_cast(zero)); + auto vals_f = svdup_n_f32(0); + values = convert_float_bfloat16(vals_f, vals_f); } inline Vectorized::Vectorized(int val) { diff --git a/aten/src/ATen/cpu/vec/vec128/vec128.h b/aten/src/ATen/cpu/vec/vec128/vec128.h index c49580410aaf..6b216f20b0bd 100644 --- a/aten/src/ATen/cpu/vec/vec128/vec128.h +++ b/aten/src/ATen/cpu/vec/vec128/vec128.h @@ -8,6 +8,7 @@ #include #include #include +#include #endif #include diff --git a/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h b/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h new file mode 100644 index 000000000000..070ba25f8574 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec128/vec128_int_aarch64.h @@ -0,0 +1,794 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +// Note [CPU_CAPABILITY namespace] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This header, and all of its subheaders, will be compiled with +// different architecture flags for each supported set of vector +// intrinsics. So we need to make sure they aren't inadvertently +// linked together. We do this by declaring objects in an `inline +// namespace` which changes the name mangling, but can still be +// accessed as `at::vec`. +inline namespace CPU_CAPABILITY { + +#define VEC_INT_NEON_TEMPLATE(vl, bit) \ + template <> \ + struct is_vec_specialized_for : std::bool_constant {}; \ + \ + template <> \ + class Vectorized { \ + using neon_type = int##bit##x##vl##_t; \ + \ + private: \ + neon_type values; \ + \ + public: \ + using value_type = int##bit##_t; \ + using size_type = int; \ + static constexpr size_type size() { \ + return vl; \ + } \ + Vectorized() { \ + values = vdupq_n_s##bit(0); \ + } \ + Vectorized(neon_type v) : values(v) {} \ + Vectorized(int##bit##_t val); \ + template < \ + typename... Args, \ + typename = std::enable_if_t<(sizeof...(Args) == size())>> \ + Vectorized(Args... 
vals) { \ + __at_align__ int##bit##_t buffer[size()] = {vals...}; \ + values = vld1q_s##bit(buffer); \ + } \ + operator neon_type() const { \ + return values; \ + } \ + static Vectorized loadu( \ + const void* ptr, \ + int64_t count = size()); \ + void store(void* ptr, int64_t count = size()) const; \ + template \ + static Vectorized blend( \ + const Vectorized& a, \ + const Vectorized& b); \ + static Vectorized blendv( \ + const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& mask_) { \ + return vbslq_s##bit(vreinterpretq_u##bit##_s##bit(mask_.values), b, a); \ + } \ + template \ + static Vectorized arange( \ + value_type base = 0, \ + step_t step = static_cast(1)); \ + static Vectorized set( \ + const Vectorized& a, \ + const Vectorized& b, \ + int64_t count = size()); \ + const int##bit##_t& operator[](int idx) const = delete; \ + int##bit##_t& operator[](int idx) = delete; \ + Vectorized abs() const { \ + return vabsq_s##bit(values); \ + } \ + Vectorized real() const { \ + return values; \ + } \ + Vectorized imag() const { \ + return vdupq_n_s##bit(0); \ + } \ + Vectorized conj() const { \ + return values; \ + } \ + Vectorized neg() const { \ + return vnegq_s##bit(values); \ + } \ + int##bit##_t reduce_add() const { \ + return vaddvq_s##bit(values); \ + } \ + int##bit##_t reduce_max() const; \ + Vectorized operator==( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vceqq_s##bit(values, other.values))); \ + } \ + Vectorized operator!=( \ + const Vectorized& other) const; \ + Vectorized operator<( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcltq_s##bit(values, other.values))); \ + } \ + Vectorized operator<=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcleq_s##bit(values, other.values))); \ + } \ + Vectorized operator>( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgtq_s##bit(values, other.values))); \ + } \ + Vectorized operator>=( \ + const Vectorized& other) const { \ + return Vectorized( \ + vreinterpretq_s##bit##_u##bit(vcgeq_s##bit(values, other.values))); \ + } \ + Vectorized eq(const Vectorized& other) const; \ + Vectorized ne(const Vectorized& other) const; \ + Vectorized gt(const Vectorized& other) const; \ + Vectorized ge(const Vectorized& other) const; \ + Vectorized lt(const Vectorized& other) const; \ + Vectorized le(const Vectorized& other) const; \ + }; \ + template <> \ + Vectorized inline operator+( \ + const Vectorized& a, const Vectorized& b) { \ + return vaddq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator-( \ + const Vectorized& a, const Vectorized& b) { \ + return vsubq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator&( \ + const Vectorized& a, const Vectorized& b) { \ + return vandq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator|( \ + const Vectorized& a, const Vectorized& b) { \ + return vorrq_s##bit(a, b); \ + } \ + template <> \ + Vectorized inline operator^( \ + const Vectorized& a, const Vectorized& b) { \ + return veorq_s##bit(a, b); \ + } \ + Vectorized inline Vectorized::eq( \ + const Vectorized& other) const { \ + return (*this == other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::ne( \ + const Vectorized& other) const { \ + return (*this != other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::gt( \ + const Vectorized& other) const { \ + return (*this > other) & 
Vectorized(1); \ + } \ + Vectorized inline Vectorized::ge( \ + const Vectorized& other) const { \ + return (*this >= other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::lt( \ + const Vectorized& other) const { \ + return (*this < other) & Vectorized(1); \ + } \ + Vectorized inline Vectorized::le( \ + const Vectorized& other) const { \ + return (*this <= other) & Vectorized(1); \ + } + +VEC_INT_NEON_TEMPLATE(2, 64) +VEC_INT_NEON_TEMPLATE(4, 32) +VEC_INT_NEON_TEMPLATE(8, 16) +VEC_INT_NEON_TEMPLATE(16, 8) + +inline int32_t Vectorized::reduce_max() const { + return vmaxvq_s32(values); +} + +inline int16_t Vectorized::reduce_max() const { + return vmaxvq_s16(values); +} + +inline int8_t Vectorized::reduce_max() const { + return vmaxvq_s8(values); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s32(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s16(a, b); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + return vmulq_s8(a, b); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + int64x2_t val = a; + return ~val; +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s32(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s16(a); +} + +template <> +inline Vectorized operator~(const Vectorized& a) { + return vmvnq_s8(a); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +inline Vectorized Vectorized::operator!=( + const Vectorized& other) const { + return ~(*this == other); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s32(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s16(a, b); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + return vminq_s8(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s32(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s16(a, b); +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + return vmaxq_s8(a, b); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint64x2_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFFFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFFFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s64(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (mask & 1LL) ? 0xFFFFFFFF : 0, + (mask & 2LL) ? 0xFFFFFFFF : 0, + (mask & 4LL) ? 0xFFFFFFFF : 0, + (mask & 8LL) ? 
0xFFFFFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + (mask & 1LL) ? 0xFFFF : 0, + (mask & 2LL) ? 0xFFFF : 0, + (mask & 4LL) ? 0xFFFF : 0, + (mask & 8LL) ? 0xFFFF : 0, + (mask & 16LL) ? 0xFFFF : 0, + (mask & 32LL) ? 0xFFFF : 0, + (mask & 64LL) ? 0xFFFF : 0, + (mask & 128LL) ? 0xFFFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); +} + +template +Vectorized Vectorized::blend( + const Vectorized& a, + const Vectorized& b) { + // Build an array of flags: each bit of element is 1 if the corresponding bit + // in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + (mask & 1LL) ? 0xFF : 0, + (mask & 2LL) ? 0xFF : 0, + (mask & 4LL) ? 0xFF : 0, + (mask & 8LL) ? 0xFF : 0, + (mask & 16LL) ? 0xFF : 0, + (mask & 32LL) ? 0xFF : 0, + (mask & 64LL) ? 0xFF : 0, + (mask & 128LL) ? 0xFF : 0, + (mask & 256LL) ? 0xFF : 0, + (mask & 512LL) ? 0xFF : 0, + (mask & 1024LL) ? 0xFF : 0, + (mask & 2048LL) ? 0xFF : 0, + (mask & 4096LL) ? 0xFF : 0, + (mask & 8192LL) ? 0xFF : 0, + (mask & 16384LL) ? 0xFF : 0, + (mask & 32768LL) ? 0xFF : 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); +} + +#define VEC_INT_NEON_OPS(vl, bit) \ + inline Vectorized::Vectorized(int##bit##_t val) { \ + values = vdupq_n_s##bit(val); \ + } \ + inline Vectorized Vectorized::loadu( \ + const void* ptr, int64_t count) { \ + if (count == size()) { \ + return vld1q_s##bit(reinterpret_cast(ptr)); \ + } else { \ + __at_align__ int##bit##_t tmp_values[size()]; \ + for (const auto i : c10::irange(size())) { \ + tmp_values[i] = 0; \ + } \ + std::memcpy( \ + tmp_values, \ + reinterpret_cast(ptr), \ + count * sizeof(int##bit##_t)); \ + return vld1q_s##bit(reinterpret_cast(tmp_values)); \ + } \ + } \ + inline void Vectorized::store(void* ptr, int64_t count) \ + const { \ + if (count == size()) { \ + vst1q_s##bit(reinterpret_cast(ptr), values); \ + } else { \ + int##bit##_t tmp_values[size()]; \ + vst1q_s##bit(reinterpret_cast(tmp_values), values); \ + std::memcpy(ptr, tmp_values, count * sizeof(int##bit##_t)); \ + } \ + } + +VEC_INT_NEON_OPS(2, 64) +VEC_INT_NEON_OPS(4, 32) +VEC_INT_NEON_OPS(8, 16) +VEC_INT_NEON_OPS(16, 8) + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x * y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return x / y; +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + return x / y; +} + +inline int64_t Vectorized::reduce_max() const { + return std::max(values[0], values[1]); +} + +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return {std::min(x[0], y[0]), std::min(x[1], y[1])}; +} + +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + return {std::max(x[0], y[0]), std::max(x[1], y[1])}; +} + +template +inline Vectorized Vectorized::arange( + 
int64_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int64x2_t step_sizes = {0, 1}; + return base_vec.values + step_sizes * step_vec.values; +} + +template +inline Vectorized Vectorized::arange( + int32_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int32x4_t step_sizes = {0, 1, 2, 3}; + return vmlaq_s32(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange( + int16_t base, + step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int16x8_t step_sizes = {0, 1, 2, 3, 4, 5, 6, 7}; + return vmlaq_s16(base_vec, step_sizes, step_vec); +} + +template +inline Vectorized Vectorized::arange(int8_t base, step_t step) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const int8x16_t step_sizes = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + return vmlaq_s8(base_vec, step_sizes, step_vec); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int64x2_t x = a; + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)63), std::min(u[1], (uint64_t)63)}; + return x >> vreinterpretq_s64_u64(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int32x4_t x = a; + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(31); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return x >> vreinterpretq_s32_u32(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int16x8_t x = a; + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(15); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return x >> vreinterpretq_s16_u16(z); +} + +template <> +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + int8x16_t x = a; + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(7); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return x >> z; +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int64x2_t y = b; + uint64x2_t u = vreinterpretq_u64_s64(y); + uint64x2_t z = {std::min(u[0], (uint64_t)64), std::min(u[1], (uint64_t)64)}; + return vshlq_s64(a, vreinterpretq_s64_u64(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int32x4_t y = b; + uint32x4_t bound = vdupq_n_u32(32); + uint32x4_t z = vminq_u32(vreinterpretq_u32_s32(y), bound); + return vshlq_s32(a, vreinterpretq_s32_u32(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int16x8_t y = b; + uint16x8_t bound = vdupq_n_u16(16); + uint16x8_t z = vminq_u16(vreinterpretq_u16_s16(y), bound); + return vshlq_s16(a, vreinterpretq_s16_u16(z)); +} + +template <> +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + int8x16_t y = b; + uint8x16_t bound = vdupq_n_u8(8); + int8x16_t z = vreinterpretq_s8_u8(vminq_u8(vreinterpretq_u8_s8(y), bound)); + return vshlq_s8(a, z); +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 2) { + return b; + } else { + int64x2_t c = {b.values[0], a.values[1]}; + return c; + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if 
(count == 0) { + return a; + } else if (count >= 4) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint32x4_t maskArray = { + (count >= 1LL) ? 0xFFFFFFFF : 0, + (count >= 2LL) ? 0xFFFFFFFF : 0, + (count >= 3LL) ? 0xFFFFFFFF : 0, + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s32(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 8) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint16x8_t maskArray = { + static_cast((count >= 1LL) ? 0xFFFF : 0), + static_cast((count >= 2LL) ? 0xFFFF : 0), + static_cast((count >= 3LL) ? 0xFFFF : 0), + static_cast((count >= 4LL) ? 0xFFFF : 0), + static_cast((count >= 5LL) ? 0xFFFF : 0), + static_cast((count >= 6LL) ? 0xFFFF : 0), + static_cast((count >= 7LL) ? 0xFFFF : 0), + 0}; + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s16(maskArray, b.values, a.values); + } +} + +inline Vectorized Vectorized::set( + const Vectorized& a, + const Vectorized& b, + int64_t count) { + if (count == 0) { + return a; + } else if (count >= 16) { + return b; + } else { + // Build an array of flags: each bit of element is 1 if the corresponding + // bit in 'mask' is set, 0 otherwise. + uint8x16_t maskArray = { + static_cast((count >= 1LL) ? 0xFF : 0), + static_cast((count >= 2LL) ? 0xFF : 0), + static_cast((count >= 3LL) ? 0xFF : 0), + static_cast((count >= 4LL) ? 0xFF : 0), + static_cast((count >= 5LL) ? 0xFF : 0), + static_cast((count >= 6LL) ? 0xFF : 0), + static_cast((count >= 7LL) ? 0xFF : 0), + static_cast((count >= 8LL) ? 0xFF : 0), + static_cast((count >= 9LL) ? 0xFF : 0), + static_cast((count >= 10LL) ? 0xFF : 0), + static_cast((count >= 11LL) ? 0xFF : 0), + static_cast((count >= 12LL) ? 0xFF : 0), + static_cast((count >= 13LL) ? 0xFF : 0), + static_cast((count >= 14LL) ? 0xFF : 0), + static_cast((count >= 15LL) ? 
0xFF : 0), + 0}; + + // Use BSL to select elements from b where the mask is 1, else from a + return vbslq_s8(maskArray, b.values, a.values); + } +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s16(a); + Vectorized highBitsB = vmovl_high_s16(b); + Vectorized lowBitsA = vmovl_s16(vget_low_s16(a)); + Vectorized lowBitsB = vmovl_s16(vget_low_s16(b)); + int32x4_t highBitsResult = highBitsA / highBitsB; + int32x4_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s16( + vreinterpretq_s16_s32(lowBitsResult), + vreinterpretq_s16_s32(highBitsResult)); +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { + Vectorized highBitsA = vmovl_high_s8(a); + Vectorized highBitsB = vmovl_high_s8(b); + Vectorized lowBitsA = vmovl_s8(vget_low_s8(a)); + Vectorized lowBitsB = vmovl_s8(vget_low_s8(b)); + int16x8_t highBitsResult = highBitsA / highBitsB; + int16x8_t lowBitsResult = lowBitsA / lowBitsB; + return vuzp1q_s8( + vreinterpretq_s8_s16(lowBitsResult), + vreinterpretq_s8_s16(highBitsResult)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index ba57ca034e9a..735315bee768 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -342,19 +342,19 @@ class Vectorized> { return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ); } Vectorized> operator<( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator<=( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator>( - const Vectorized>&) const { + 
const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } Vectorized> operator>=( - const Vectorized>&) const { + const Vectorized>& /*unused*/) const { TORCH_CHECK(false, "not supported for complex numbers"); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 515cbff730d9..559db3c97567 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -905,7 +905,7 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 128 bits. However, by using _mm256_castsi128_si256, the upper 128 // bits of the result are undefined. - // TODO We can use _mm256_zextsi128_si256 in the furture, + // TODO We can use _mm256_zextsi128_si256 in the future, // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadl_epi64(reinterpret_cast(ptr)); return _mm256_castsi128_si256(input_128); @@ -1844,7 +1844,7 @@ Vectorized inline shift_256_16( c0 = _mm256_srav_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_1_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%2==1. __m256i a1 = _mm256_and_si256(a, keep_1); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2180,7 +2180,7 @@ Vectorized inline shift_256_8( c0 = _mm256_srlv_epi32(a0, b0); c0 = _mm256_shuffle_epi8(c0, ctl_3_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==1. __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3); __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0); @@ -2193,7 +2193,7 @@ Vectorized inline shift_256_8( c1 = _mm256_srlv_epi32(a1, b1); c1 = _mm256_shuffle_epi8(c1, ctl_3_1); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==2. __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3); __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0); @@ -2206,7 +2206,7 @@ Vectorized inline shift_256_8( c2 = _mm256_srlv_epi32(a2, b2); c2 = _mm256_shuffle_epi8(c2, ctl_3_2); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%4==3. 
__m256i a3 = _mm256_and_si256(a, keep_3); __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index dafe444163eb..145ac7aee567 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -1377,7 +1377,7 @@ Vectorized inline maximum( #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)) std::pair, Vectorized> inline convert_int8_to_float( at::vec::Vectorized src) { - auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s8x8 = vget_low_s8(src); auto s16x8 = vmovl_s8(s8x8); auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8)); @@ -1402,7 +1402,7 @@ std::pair, Vectorized> inline convert_int8_to_float( Vectorized inline convert_int8_half_register_to_float( at::vec::Vectorized src) { - auto s8x8 = vld1_s8(src.operator const int8_t*()); + auto s8x8 = vget_low_s8(src); auto s16x8 = vmovl_s8(s8x8); auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8)); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 5f80a7c2bcff..8b2768fab6a3 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -1088,7 +1088,7 @@ class Vectorized8 : public Vectorizedi { // Because loadu(const void* ptr, T count) requires zero initialization for // upper 384 bits. However, by using _mm512_castsi128_si512, the upper 384 // bits of the result are undefined. - // TODO We can use _mm512_zextsi128_si512 in the furture, + // TODO We can use _mm512_zextsi128_si512 in the future, // since gcc 9.3 doesn't support it now. __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr)); return _mm512_castsi128_si512(input_128); @@ -2022,7 +2022,7 @@ Vectorized inline shift_512_8( c0 = _mm512_srlv_epi16(a0, b0); c0 = _mm512_shuffle_epi8(c0, ctl_1_0); - // Peform shifting the same way for input array elements with + // Perform shifting the same way for input array elements with // idx%2==1. 
__m512i a1 = _mm512_and_si512(a, keep_1); __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0); diff --git a/aten/src/ATen/cpu/vec/vec_quant.h b/aten/src/ATen/cpu/vec/vec_quant.h index 36602c4a760f..ae9e86c6a9c8 100644 --- a/aten/src/ATen/cpu/vec/vec_quant.h +++ b/aten/src/ATen/cpu/vec/vec_quant.h @@ -149,5 +149,105 @@ static inline void pack_vnni4( #endif } +// This is a helper function for transpose_pack_vnni4 +// Transform a [4, 16] block (with incontiguous output) +// Src: +// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 +// b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16 +// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 +// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 +// Dst: +// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4 +// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8 +// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12 +// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16 +template > +static inline void transpose_vnni4_pad_4x16_block( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t ld_dst, + int krem = 4) { +#if defined(CPU_CAPABILITY_AVX512) + __m128i r[4]; + for (int i = 0; i < krem; ++i) { + r[i] = _mm_loadu_si128(reinterpret_cast(src + i * ld_src)); + } + for (int i = krem; i < 4; ++i) { + r[i] = _mm_setzero_si128(); + } + + // Transpose 4x16 bytes using unpack and shuffle + __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); + __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]); + __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]); + __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); + + __m128i r0 = _mm_unpacklo_epi64(t0, t2); + __m128i r1 = _mm_unpackhi_epi64(t0, t2); + __m128i r2 = _mm_unpacklo_epi64(t1, t3); + __m128i r3 = _mm_unpackhi_epi64(t1, t3); + + // Store output + if (krem == 4) { + // normal case + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3); + } else { + // masked case + __mmask16 mask = (1ULL << (krem * 4)) - 1; + _mm_mask_storeu_epi8(dst, mask, r0); + _mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3); + } +#else + TORCH_CHECK( + false, + "transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported") +#endif +} + +// Do the transpose packing fusion with VNNI4 +// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8) +template > +static inline void transpose_pack_vnni4( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t K, + int64_t N) { +#if defined(CPU_CAPABILITY_AVX512) + TORCH_CHECK( + N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4"); + int64_t bk = 0; + int64_t _K = K / 4 * 4; + for (; bk < _K; bk += 4) { + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4); + } + } + + // Handle leftover K rows (< 4) + if (K % 4 != 0) { + int krem = K - bk; + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem); + } + } +#else + TORCH_CHECK( + false, "transpose_pack_vnni4 is only supported when AVX-512 is supported") +#endif +} + } // namespace CPU_CAPABILITY } // namespace at::vec 
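Annotation, not part of the patch: the new transpose_pack_vnni4 above reorders a [K, N] bit8 matrix into the [N/4, K, 4] VNNI4 layout via AVX-512 4x16 blocks. As a reference for what the fast path produces, here is a hedged scalar sketch of the same index mapping; the formula is derived from the block comments in the hunk, and the _ref suffix marks it as an illustrative helper rather than code from the PR.

#include <cstdint>

// Scalar reference: dst holds N/4 groups of K*4 elements (the AVX-512 path already
// requires N % 16 == 0). Source element (k, n) lands at group n/4, row k, lane n%4.
template <typename scalar_t>
void transpose_pack_vnni4_ref(
    const scalar_t* src,
    scalar_t* dst,
    int64_t ld_src,
    int64_t K,
    int64_t N) {
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t k = 0; k < K; ++k) {
      dst[(n / 4) * K * 4 + k * 4 + (n % 4)] = src[k * ld_src + n];
    }
  }
}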
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index a81d34df4d64..6933099bb1f3 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -16,6 +16,8 @@ #include #include +#include + #ifdef USE_ROCM #include #include @@ -108,7 +110,7 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error) namespace { -static cublasOperation_t _cublasOpFromChar(char op) { +cublasOperation_t _cublasOpFromChar(char op) { // NOLINTNEXTLINE(bugprone-switch-missing-default-case) switch (op) { case 'n': @@ -128,7 +130,7 @@ static cublasOperation_t _cublasOpFromChar(char op) { "_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); } -static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { +void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { // Note: leading dimensions generally are checked that they are > 0 // and at least as big the result requires (even if the value won't // be used). @@ -142,7 +144,7 @@ static void _cublasAdjustLdLevel2(int64_t m, int64_t n, int64_t* lda) { *lda = std::max(m, 1); } -static void _cublasAdjustLdLevel3( +void _cublasAdjustLdLevel3( char transa, char transb, int64_t m, @@ -191,6 +193,10 @@ uint32_t _getAlignment(uintptr_t address) { #ifdef USE_ROCM static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { + // 0 is default value, meaning full CUs i.e. no mask + if (value == 0) { + return at::cuda::getCurrentCUDAStream(); + } static int32_t last_value = 0; static hipStream_t stream; if (last_value == 0) { @@ -209,15 +215,15 @@ static c10::cuda::CUDAStream _getCarveoutStream(int32_t value) { int32_t CUs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; // how many uint32_t do we need to cover all CUs, fill bitmask with 1 uint32_t mask_size = static_cast((CUs + 32 - 1) / 32); - std::vector mask(mask_size, uint32_t{0xffffffff}); + std::vector mask(mask_size, uint32_t{0x00000000}); // starting from lowest order bits, in 32-bit chunks // set bits to 0 based on how many CUs to carve out int32_t full_shifts = value / 32; int32_t remainder = value % 32; for (int32_t i = 0; i < full_shifts; i++) { - mask[i] = uint32_t{0x00000000}; + mask[i] = uint32_t{0xffffffff}; } - mask[full_shifts] = uint32_t{0xffffffff} << remainder; + mask[full_shifts] = uint32_t{0xffffffff} << (32 - remainder); // finally, create masked stream AT_CUDA_CHECK(hipExtStreamCreateWithCUMask(&stream, mask_size, &mask[0])); @@ -319,7 +325,7 @@ class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { // NOLINTNEXTLINE(bugprone-sizeof-expression) TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(value))); } @@ -341,7 +347,7 @@ class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { + void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -356,7 +362,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< descriptor_.reset(raw_descriptor); } template - inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { + void setAttribute(cublasLtMatmulPreferenceAttributes_t 
attr, const T value) { TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); } }; @@ -391,7 +397,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v>) { @@ -418,25 +424,40 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + fp16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + bf16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? 
(CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else { static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); } - globalContext().alertCuBLASConfigNotDeterministic(); cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -570,8 +591,6 @@ inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_D template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -583,8 +602,6 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { template <> void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -596,8 +613,6 @@ void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -611,8 +626,6 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::co template <> void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -626,8 +639,6 @@ void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::com template inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -699,8 +710,6 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP template inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -1024,8 +1033,6 @@ inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dty template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = 
at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1037,8 +1044,6 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { template <> void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1050,8 +1055,6 @@ void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1065,8 +1068,6 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::comp template <> void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1080,8 +1081,6 @@ void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::compl template inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1139,8 +1138,15 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( } if (prop->major >= 5) { cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + TORCH_CHECK(fp16_reduction != + at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, + "torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction(" + "..., allow_splitk=False) requires the cuBLASLt backend"); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + cublas_flags = static_cast( + cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } // Disallow fp16 reductions that could lead to unexpected overflow issues. 
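// Annotation, not part of the patch: the repeated cuBLASLt hunks in this file map the
// new three-state at::CuBLASReductionOption onto a reduction-scheme mask. A hedged
// consolidation of that mapping is sketched below; the helper name is illustrative,
// not a helper introduced by the PR.
static uint32_t lt_reduction_scheme_mask(at::CuBLASReductionOption opt) {
  // AllowReducedPrecisionWithSplitK: callers skip setting the preference entirely.
  // DisallowReducedPrecisionAllowSplitK: split-K stays allowed as long as partial
  // results are kept in the compute type (or no reduction scheme is used at all).
  if (opt == at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK) {
    return CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE;
  }
  // DisallowReducedPrecisionDisallowSplitK: only the "no reduction" scheme remains.
  return CUBLASLT_REDUCTION_SCHEME_NONE;
}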
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, cublas_flags)); @@ -1190,7 +1196,6 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE( template inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opb = _cublasOpFromChar(transb); @@ -1200,8 +1205,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT GEMM_CHECK_ARGVALUES(at::BFloat16); #ifndef USE_ROCM cublasMath_t cublas_flags = CUBLAS_DEFAULT_MATH; - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - cublas_flags = static_cast(cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + TORCH_CHECK(bf16_reduction != + at::CuBLASReductionOption::DisallowReducedPrecisionDisallowSplitK, + "torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction(" + "..., allow_splitk=False) requires the cuBLASLt backend"); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + cublas_flags = static_cast( + cublas_flags | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION); } #endif #if defined(USE_ROCM) @@ -1290,7 +1302,7 @@ void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) } #if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) { - if (at::detail::getCUDAHooks().isGPUArch({"gfx1100"})) { //no CK GEMM version for gfx1100 + if (at::detail::getCUDAHooks().isGPUArch({"gfx11", "gfx12"})) { //no CK GEMM version gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); } else{ at::native::gemm_internal_ck(CUDABLAS_GEMM_ARGS(float)); @@ -1579,7 +1591,7 @@ bool gemm_and_bias( computeType = CUBLAS_COMPUTE_64F; scaleType = CUDA_R_64F; } else if constexpr (std::is_same_v) { - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = CUBLAS_COMPUTE_32F_FAST_TF32; } } else if constexpr (std::is_same_v) { @@ -1597,18 +1609,34 @@ bool gemm_and_bias( abType = CUDA_R_16F; cType = (std::is_same_v) ? CUDA_R_32F : CUDA_R_16F; #ifndef USE_ROCM - if (!at::globalContext().allowFP16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS(); + if (fp16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + fp16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } else if constexpr (std::is_same_v) { abType = CUDA_R_16BF; cType = (std::is_same_v) ?
CUDA_R_32F : CUDA_R_16BF; #ifndef USE_ROCM - if (!at::globalContext().allowBF16ReductionCuBLAS()) { - preference.setAttribute(CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, - CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | CUBLASLT_REDUCTION_SCHEME_NONE); + auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS(); + if (bf16_reduction != + at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) { + uint32_t mask = + bf16_reduction == + at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK + ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE | + CUBLASLT_REDUCTION_SCHEME_NONE) + : CUBLASLT_REDUCTION_SCHEME_NONE; + preference.setAttribute( + CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK, mask); } #endif } @@ -1637,9 +1665,7 @@ bool gemm_and_bias( if (activation == GEMMAndBiasActivationEpilogue::RELU) { epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; } else if (activation == GEMMAndBiasActivationEpilogue::GELU) { -#if CUDA_VERSION >= 11040 || defined(USE_ROCM) epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; -#endif } if (bias != nullptr) { @@ -1837,6 +1863,8 @@ template bool gemm_and_bias( int64_t result_ld, GEMMAndBiasActivationEpilogue activation); +using at::blas::ScalingType; + int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fast_accum) { switch (scaling_type) { case ScalingType::BlockWise1x32: @@ -1928,14 +1956,15 @@ void scaled_gemm( const void *result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum) { + bool use_fast_accum, + const std::optional& alpha) { // Note: see `cublasCommonArgs` for various non-intuitive manupulations // of input arguments to this function. -#if CUDA_VERSION >= 11080 || defined(USE_ROCM) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; - const float alpha_val = 1.0; - const float beta_val = 0.0; + // Note: alpha_val may change later depending on user-passed argument + float alpha_val = 1.0; + float beta_val = 0.0; CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); @@ -1954,8 +1983,8 @@ void scaled_gemm( #if ROCM_VERSION >= 70000 if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) { // TODO: add constraints based on hipblaslt internals - TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0), - "Matrix dimensions must be multiples of 32 for MX format. " + TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0), + "M, N must be multiples of 16 and K should be multiple of 128 for MX format. " "Got m=", m, ", n=", n, ", k=", k); } #endif @@ -2006,6 +2035,33 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype)); } + + // Handle user-passed alpha + float *alpha_ptr = &alpha_val; + float *beta_ptr = &beta_val; + + if (alpha.has_value()) { + auto& a = alpha.value(); + + // if device-tensor + if (a.is_cuda()) { + // NOTE: there are lifetime requirements on device-side pointers for alpha/beta -- the value must be + // valid & correct until the cublas call finishes (not is scheduled like host-side values). Thus + // we need to use allocations for alpha/beta that have some guarantees on lifetime - a statically + // managed 4B buffer for alpha that we'll copy the passed alpha value into, and constant memory + // for beta respectively. 
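The scaled_gemm() path above is what torch._scaled_mm() ends up calling; a hedged usage sketch (assumes an fp8-capable GPU and a recent build where _scaled_mm returns a single tensor):

```python
import torch

a = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn)       # row-major mat1
b = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn).t()   # column-major mat2, shape (64, 128)

# fp32 tensor-wise scales -> ScalingType::TensorWise on the C++ side
scale_a = torch.tensor(1.0, device="cuda")
scale_b = torch.tensor(1.0, device="cuda")

out = torch._scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)  # (128, 128) bf16 result
```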
+ float *user_alpha_ptr = at::cuda::detail::get_user_alpha_ptr(); + at::Tensor user_alpha = at::from_blob(user_alpha_ptr, {1}, TensorOptions().device(kCUDA).dtype(kFloat)); + user_alpha.copy_(a); + // Tell cublasLt we're using device-side pointers for alpha/beta + auto pointer_mode = CUBLASLT_POINTER_MODE_DEVICE; + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_POINTER_MODE, pointer_mode); + alpha_ptr = user_alpha.data_ptr(); + beta_ptr = at::cuda::detail::get_cublas_device_zero(); + } else { + alpha_val = a.item(); + } + } // For other data types, use the get_scale_mode function based on scaling type // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt, // but we must invoke get_scale_mode anyways to trigger the version checks. @@ -2023,6 +2079,7 @@ void scaled_gemm( cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( ltHandle, computeDesc.descriptor(), @@ -2063,10 +2120,10 @@ void scaled_gemm( auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, Adesc.descriptor(), Bdesc.descriptor(), - &beta_val, + beta_ptr, Cdesc.descriptor(), Ddesc.descriptor(), all_algos[i].algo, @@ -2085,17 +2142,14 @@ void scaled_gemm( cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), - &alpha_val, + alpha_ptr, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), - &beta_val, -#ifdef USE_ROCM + beta_ptr, + // NOTE: always use result_ptr here, because cuBLASLt w/device beta=0 can't handle nullptr either result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr -#else - nullptr, -#endif // ifdef USE_ROCM Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -2133,8 +2187,6 @@ void scaled_gemm( " scaleType ", scaleType); return; -#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM) - TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } void int8_gemm( @@ -2409,8 +2461,6 @@ void trsmBatched>( template <> void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2426,8 +2476,6 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. NoTF32Guard disable_tf32; - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2440,8 +2488,6 @@ void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); @@ -2455,8 +2501,6 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { // gemv is bw bound, and does not benefit from TF32. But the precision // loss still happens on TF32. So we disable it here. 
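The alertCuBLASConfigNotDeterministic() calls removed throughout these gemm/bgemm/gemv wrappers are the ones triggered by deterministic mode; for reference, the Python-side combination that exercises that check (a sketch, assuming a CUDA device):

```python
import os
# Must be one of the two values cuBLAS accepts for deterministic workspaces,
# and must be set before the first cuBLAS call.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"   # or ":16:8"

import torch
torch.use_deterministic_algorithms(True)

a = torch.randn(128, 128, device="cuda", dtype=torch.double)
b = torch.randn(128, 128, device="cuda", dtype=torch.double)
c = a @ b   # previously these call sites could raise if the workspace config was missing
```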
NoTF32Guard disable_tf32; - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t op = _cublasOpFromChar(trans); _cublasAdjustLdLevel2(m, n, &lda); diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index b235840418e2..0295948311a5 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -14,6 +14,7 @@ */ #include +#include #include namespace at::cuda::blas { @@ -136,15 +137,6 @@ void int8_gemm( int32_t* result_ptr, int64_t result_ld); -enum class ScalingType : std::uint8_t { - TensorWise, // fp32 scales - RowWise, // fp32 scales - BlockWise1x16, // fp8_e4m3fn scales - BlockWise1x32, // fp8_e8m0fnu scales - BlockWise1x128, // fp32 scales - BlockWise128x128, // fp32 scales -}; - void scaled_gemm( char transa, char transb, @@ -156,20 +148,21 @@ void scaled_gemm( int64_t mat1_ld, ScalarType mat1_dtype, ScalarType mat1_scale_dtype, - ScalingType mat1_scaling_type, + at::blas::ScalingType mat1_scaling_type, const void* mat2_ptr, const void* mat2_scale_ptr, int64_t mat2_ld, ScalarType mat2_dtype, ScalarType mat2_scale_dtype, - ScalingType mat2_scaling_type, + at::blas::ScalingType mat2_scaling_type, const void* bias_ptr, ScalarType bias_dtype, void* result_ptr, const void* result_scale_ptr, int64_t result_ld, ScalarType result_dtype, - bool use_fast_accum); + bool use_fast_accum, + const std::optional& alpha); #define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index f95faa94e611..2e387fbc264d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -15,19 +15,19 @@ namespace cuda::detail { namespace { // Total number of gpus in the system. -static int64_t num_gpus; +int64_t num_gpus; // Ensures default_gens_cuda is initialized once. -static std::deque cuda_gens_init_flag; +std::deque cuda_gens_init_flag; // Default, global CUDA generators, one per GPU. -static std::vector default_gens_cuda; +std::vector default_gens_cuda; /* * Populates the global variables related to CUDA generators * Warning: this function must only be called once! */ -static void initCUDAGenVector() { +void initCUDAGenVector() { // Ensures we only call cudaGetDeviceCount only once. static bool num_gpu_init_flag [[maybe_unused]] = []() { num_gpus = static_cast(c10::cuda::device_count()); @@ -109,7 +109,7 @@ void CUDAGeneratorState::increase(uint64_t increment) { offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); // Ensures the increment does not cause overflow. TORCH_INTERNAL_ASSERT( - offset_intragraph_ <= std::numeric_limits::max() - increment, + offset_intragraph_ <= std::numeric_limits::max() - increment, "Increment causes overflow in the offset value."); offset_intragraph_ += increment; } else { @@ -325,9 +325,9 @@ uint64_t CUDAGeneratorImpl::seed() { */ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { // The RNG state comprises the seed, and an offset used for Philox. 
- static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, std::nullopt, std::nullopt, std::nullopt, std::nullopt); auto rng_state = state_tensor.data_ptr(); @@ -346,9 +346,9 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. */ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = seed_size + offset_size; + constexpr size_t seed_size = sizeof(uint64_t); + constexpr size_t offset_size = sizeof(int64_t); + constexpr size_t total_size = seed_size + offset_size; detail::check_rng_state(new_state); @@ -461,7 +461,7 @@ void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { - uint32_t offset = state_->offset_intragraph_; + uint64_t offset = state_->offset_intragraph_; state_->increase(increment); return PhiloxCudaState( state_->seed_extragraph_.data_ptr(), diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index b0b77cb822a8..d4ab49382e7f 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -96,16 +96,16 @@ struct CUDAGraph; struct CUDAGeneratorState : public c10::intrusive_ptr_target { uint64_t seed_; uint64_t philox_offset_per_thread_; - uint32_t offset_intragraph_; + uint64_t offset_intragraph_; bool capturing_{}; std::unordered_set registered_graphs_; - at::TensorBase seed_extragraph_{}; - at::TensorBase offset_extragraph_{}; + at::TensorBase seed_extragraph_; + at::TensorBase offset_extragraph_; CUDAGeneratorState( uint64_t seed = default_rng_seed_val, uint64_t philox_offset_per_thread = 0, - uint32_t offset_intragraph = 0) + uint64_t offset_intragraph = 0) : seed_(seed), philox_offset_per_thread_(philox_offset_per_thread), offset_intragraph_(offset_intragraph) {} @@ -167,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { CUDAGeneratorImpl* clone_impl() const override; c10::intrusive_ptr state_; - std::atomic_flag no_reset_rnn_state_{}; + std::atomic_flag no_reset_rnn_state_; }; namespace cuda::detail { diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index c18ad66b2080..a32e7b4b86f0 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -56,7 +56,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // the ID assigned by cuda during graph capture, // used to identify when a stream is participating in capture - CaptureId_t capture_id_ = -1; + CaptureId_t capture_id_ = 0; // uuid used to request a particular private mempool from CUDACachingAllocator. // By default, this will be set to {id_, 0}. 
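The get_state()/set_state() pair above backs the Python-level RNG state round-trip (seed bytes followed by the Philox offset); a small sketch assuming a CUDA device:

```python
import torch

torch.cuda.manual_seed(1234)
state = torch.cuda.get_rng_state()   # uint8 tensor laid out as seed + Philox offset (see get_state above)
x = torch.randn(4, device="cuda")

torch.cuda.set_rng_state(state)      # rewind to the snapshot taken above
y = torch.randn(4, device="cuda")
assert torch.equal(x, y)
```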
diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index 736fbe4ae50d..e00e50b38d2d 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -6,43 +6,15 @@ #define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch) #endif -// cuSparse Generic API added in CUDA 10.1 -// Windows support added in CUDA 11.0 -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32))) -#define AT_USE_CUSPARSE_GENERIC_API() 1 -#else -#define AT_USE_CUSPARSE_GENERIC_API() 0 -#endif - -// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0 -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ - (CUSPARSE_VERSION < 12000) -#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1 -#else -#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0 -#endif - -#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \ - (CUSPARSE_VERSION >= 12000) -#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1 -#else -#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0 -#endif #if defined(USE_ROCM) // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #endif #else // USE_ROCM -#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index 092314ac81f2..d5f04df55f9c 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -12,8 +12,6 @@ cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { return cusparseDestroyDnMat(const_cast(dnMatDescr)); } -#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - namespace { // If a specific GPU model does not provide native support for a given data @@ -210,6 +208,4 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 descriptor_.reset(raw_descriptor); } -#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 7fc482f2a3fb..f12ef628e13f 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -35,7 +35,6 @@ class CuSparseDescriptor { std::unique_ptr> descriptor_; }; -#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() template struct ConstCuSparseDescriptorDeleter { void operator()(T* x) { @@ -58,7 +57,6 @@ class ConstCuSparseDescriptor { protected: std::unique_ptr> descriptor_; }; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS #if defined(USE_ROCM) using cusparseMatDescr = std::remove_pointer_t; @@ -123,39 +121,8 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info #endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE -#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); -#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || 
AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() -class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); -}; - -class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); - cusparseDnMatDescr* unsafe_mutable_descriptor() const { - return const_cast(descriptor()); - } - cusparseDnMatDescr* unsafe_mutable_descriptor() { - return const_cast(descriptor()); - } -}; - -class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor - : public CuSparseDescriptor { - public: - explicit CuSparseDnVecDescriptor(const Tensor& input); -}; - -class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor - : public CuSparseDescriptor {}; - -#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< cusparseDnMatDescr, @@ -194,7 +161,6 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { @@ -283,6 +249,4 @@ class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor } }; -#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() - } // namespace at::cuda::sparse diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 34aa15d0c06c..a2260d23b2d4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -9,7 +9,6 @@ #include #include -#include namespace at::cuda { namespace { @@ -72,9 +71,20 @@ using Block = HostBlock; struct CUDACachingHostAllocatorImpl : public CachingHostAllocatorImpl { private: - std::unordered_map use_host_register; + ska::flat_hash_map use_host_register; void allocate_host_memory(size_t size, void** ptr) override { + // try allocating from reserve segment first before calling into expensive APIs + if (get_reserve_segment().initialized()) { + *ptr = get_reserve_segment().allocate(size); + if (*ptr != nullptr) { + return; + } + } + allocate_host_memory_slowpath(size, ptr); + } + + void allocate_host_memory_slowpath(size_t size, void** ptr) { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -113,6 +123,18 @@ struct CUDACachingHostAllocatorImpl } void free_block(Block* block) override { + // We never free blocks from the reserve segment + if (get_reserve_segment().initialized()) { + // Check if the block is from the reserve segment + if (get_reserve_segment().owns(block->ptr_)) { + return; + } + } + + free_block_slowpath(block); + } + + void free_block_slowpath(Block* block) { auto start = std::chrono::steady_clock::now(); // Users may change the allocator config at will. torch unit tests do this. 
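The reserve-segment fast path above is sized by the pinned_reserve_segment_size_mb allocator option; a hedged sketch of how it would be exercised (assuming a build with this patch and that the option is parsed from PYTORCH_CUDA_ALLOC_CONF like the other pinned_* settings):

```python
import os
# Assumption: read from the allocator config string, so it must be set
# before the first pinned allocation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "pinned_reserve_segment_size_mb:64"

import torch

staging = torch.empty(8 << 20, dtype=torch.uint8, pin_memory=True)  # small requests may be carved from the reserve segment
gpu = staging.to("cuda", non_blocking=True)                         # pinned source enables the async H2D copy
```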
// However, allocations using cudaHostRegister should use corresonding @@ -161,17 +183,26 @@ struct CUDACachingHostAllocatorImpl return true; } - bool pinned_use_background_threads() override { - return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: - pinned_use_background_threads(); - } - EventPool::Event create_event_internal(DeviceIndex idx) { // Leak the event pool to avoid shutdown issue. static auto* event_pool = new EventPool(); return event_pool->get(idx); } + PinnedReserveSegment& get_reserve_segment() { + static auto reserve_segment = [&]() { + if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() > 0) { + void *ptr; + size_t sz = c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_reserve_segment_size_mb() * 1024 * 1024; + allocate_host_memory_slowpath(sz, &ptr); + return PinnedReserveSegment(ptr, sz); + } else { + return PinnedReserveSegment(); + } + } (); + return reserve_segment; + } + TaskThreadPool* getThreadPool() { static TaskThreadPool* pool = new TaskThreadPool( static_cast(c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: @@ -186,15 +217,15 @@ struct CUDACachingHostAllocatorImpl size_t numThreads, size_t pageSize) { uintptr_t start = (uintptr_t)ptr + (size * i / numThreads); - uintptr_t end = (uintptr_t)start + (size / numThreads); + uintptr_t end = start + (size / numThreads); if (i == (numThreads - 1)) { end = (uintptr_t)ptr + size; } // pre-fault/map the pages by setting the first byte of the page uintptr_t alignedStart = - (((uintptr_t)start + pageSize - 1) & ~(pageSize - 1)); - for (uintptr_t p = alignedStart; p < ((uintptr_t)end); p += pageSize) { + ((start + pageSize - 1) & ~(pageSize - 1)); + for (uintptr_t p = alignedStart; p < (end); p += pageSize) { // NOLINTNEXTLINE(performance-no-int-to-ptr) memset((void*)p, 0, 1); } diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 329851341443..d7832c761ae5 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -310,7 +310,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
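These handle-level math-mode decisions correspond to the Python TF32 switches; a short sketch (the fp32_precision attribute is assumed to exist in this build and is guarded accordingly):

```python
import torch

torch.backends.cuda.matmul.allow_tf32 = True   # classic boolean switch

# Newer per-backend/per-op precision API behind Float32Backend/Float32Op (assumed attribute name):
if hasattr(torch.backends.cuda.matmul, "fp32_precision"):
    torch.backends.cuda.matmul.fp32_precision = "tf32"

a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")
c = a @ b   # eligible for CUBLAS_TF32_TENSOR_OP_MATH
```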
if (!NoTF32Guard::should_disable_tf32() && - at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 23a3ff8c8958..7828c3917fc4 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -177,7 +177,6 @@ inline void segmented_sort_pairs( } } -#if CUB_SUPPORTS_UNIQUE_BY_KEY() template inline void unique_by_key( KeysInputIteratorT keys_in, ValuesInputIteratorT values_in, @@ -193,7 +192,6 @@ inline void unique_by_key( CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey, keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream()); } -#endif namespace impl { @@ -579,7 +577,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT #endif } -#if CUB_SUPPORTS_SCAN_BY_KEY() template inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) { @@ -607,7 +604,6 @@ inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT #endif } -#endif template void unique(InputIteratorT input, OutputIteratorT output, diff --git a/aten/src/ATen/cuda/cub_definitions.cuh b/aten/src/ATen/cuda/cub_definitions.cuh index b80951269209..0d76ae6e8dcf 100644 --- a/aten/src/ATen/cuda/cub_definitions.cuh +++ b/aten/src/ATen/cuda/cub_definitions.cuh @@ -28,22 +28,6 @@ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false #endif -// cub support for UniqueByKey is added to cub 1.16 in: -// https://github.com/NVIDIA/cub/pull/405 -#if CUB_VERSION >= 101600 -#define CUB_SUPPORTS_UNIQUE_BY_KEY() true -#else -#define CUB_SUPPORTS_UNIQUE_BY_KEY() false -#endif - -// cub support for scan by key is added to cub 1.15 -// in https://github.com/NVIDIA/cub/pull/376 -#if CUB_VERSION >= 101500 -#define CUB_SUPPORTS_SCAN_BY_KEY() 1 -#else -#define CUB_SUPPORTS_SCAN_BY_KEY() 0 -#endif - // cub support for cub::FutureValue is added to cub 1.15 in: // https://github.com/NVIDIA/cub/pull/305 #if CUB_VERSION >= 101500 diff --git a/aten/src/ATen/cuda/detail/BLASConstants.cu b/aten/src/ATen/cuda/detail/BLASConstants.cu new file mode 100644 index 000000000000..967388044705 --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.cu @@ -0,0 +1,54 @@ +#include +#include +#include + +#include + +namespace at { +namespace cuda { +namespace detail { + +__device__ __constant__ float cublas_one_device; +__device__ __constant__ float cublas_zero_device; + +float *get_cublas_device_one() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float one = 1.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_one_device)); + return ptr; +} + +float *get_cublas_device_zero() { + static c10::once_flag init_flag; + + c10::call_once(init_flag, []() { + const float zero = 0.f; + AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float))); + }); + + float *ptr; + AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast(&ptr), cublas_zero_device)); + return ptr; +} + +float *get_user_alpha_ptr() { + static float *alpha_ptr; + + static c10::once_flag init_flag; 
+ + c10::call_once(init_flag, []() { + AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float))); + }); + + return alpha_ptr; +} + +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/BLASConstants.h b/aten/src/ATen/cuda/detail/BLASConstants.h new file mode 100644 index 000000000000..d62aaf1330ee --- /dev/null +++ b/aten/src/ATen/cuda/detail/BLASConstants.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace at::cuda::detail { + +float *get_cublas_device_one(); +float *get_cublas_device_zero(); +float *get_user_alpha_ptr(); + +} // namespace at::cuda::detail diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 72826b584792..b7f80101d926 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -281,6 +281,9 @@ bool CUDAHooks::compiledWithMIOpen() const { bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } // NOTE: extra parenthesis around numbers disable clang warnings about // dead code return true; @@ -291,6 +294,9 @@ bool CUDAHooks::supportsDilatedConvolutionWithCuDNN() const { bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 7) { @@ -305,6 +311,26 @@ bool CUDAHooks::supportsDepthwiseConvolutionWithCuDNN() const { bool CUDAHooks::supportsBFloat16ConvolutionWithCuDNNv8() const { #if AT_CUDNN_ENABLED() + if (!hasCUDA()) { + return false; + } + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + // Check for Volta cores + if (prop->major >= 8) { + return true; + } else { + return false; + } +#else + return false; +#endif +} + +bool CUDAHooks::supportsBFloat16RNNWithCuDNN() const { +#if AT_CUDNN_ENABLED() && (CUDNN_VERSION >= 91300) + if (!hasCUDA()) { + return false; + } cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); // Check for Volta cores if (prop->major >= 8) { diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index 2780369a37b7..8d3d1db00392 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -17,7 +17,7 @@ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); // The real implementation of CUDAHooksInterface struct CUDAHooks : public at::CUDAHooksInterface { - CUDAHooks(at::CUDAHooksArgs) {} + CUDAHooks(at::CUDAHooksArgs /*unused*/) {} void init() const override; Device getDeviceFromPtr(void* data) const override; bool isPinnedPtr(const void* data) const override; @@ -45,6 +45,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool supportsDilatedConvolutionWithCuDNN() const override; bool supportsDepthwiseConvolutionWithCuDNN() const override; bool supportsBFloat16ConvolutionWithCuDNNv8() const override; + bool supportsBFloat16RNNWithCuDNN() const override; bool hasCUDART() const override; long versionCUDART() const override; long versionCuDNN() const override; diff --git a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h index 1f80c863b639..71a344d281d2 100644 --- a/aten/src/ATen/cuda/detail/DeviceThreadHandles.h +++ b/aten/src/ATen/cuda/detail/DeviceThreadHandles.h @@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this 0) { + if(!my_handles.empty()) { auto parent = 
weak_parent.lock(); if (!parent) { // If this thread exits after atexit handlers have completed, the diff --git a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh index 231cd167cacb..7de0321256fd 100644 --- a/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +++ b/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -19,7 +19,7 @@ struct PhiloxCudaState { // Called if graph capture is underway PhiloxCudaState(int64_t* seed, int64_t* offset_extragraph, - uint32_t offset_intragraph) { + uint64_t offset_intragraph) { seed_.ptr = seed; offset_.ptr = offset_extragraph; offset_intragraph_ = offset_intragraph; @@ -36,7 +36,7 @@ struct PhiloxCudaState { Payload seed_{}; Payload offset_{}; - uint32_t offset_intragraph_ = 0; + uint64_t offset_intragraph_ = 0; bool captured_ = false; }; diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h index 6d19907aba4a..5d9e33b2b5b2 100644 --- a/aten/src/ATen/cuda/tunable/GemmCommon.h +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -29,7 +30,7 @@ namespace at::cuda::tunable { -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; enum class BlasOp { N = 0, @@ -150,6 +151,7 @@ inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { BLASType = "unknown"; } return BLASType; + } // Similar to Compute Type in GemmRocblas.h @@ -162,7 +164,7 @@ inline std::string ComputeTypeFor() { // ROCBLAS and hipBLASLt. template <> inline std::string ComputeTypeFor() { - if (at::globalContext().float32Precision("cuda", "matmul") != "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { return "f32_r"; } else { return "xf32_r"; @@ -244,33 +246,25 @@ inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivatio namespace detail { -static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { + + if (!config.enabled) { + return true; // skip when disabled + } + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); - // comparison done as 1D tensor at::Tensor ref = at::from_blob(c, {size}, options); at::Tensor oth = at::from_blob(other_c, {size}, options); at::Tensor ref_float = ref.to(at::kFloat); at::Tensor oth_float = oth.to(at::kFloat); - std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; - std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; - double last_succeed_atol = 1; - double last_succeed_rtol = 1; - for (auto& atol : atols) { - for (auto& rtol : rtols) { - if (at::allclose(ref_float, oth_float, rtol, atol)) { - last_succeed_atol = atol; - last_succeed_rtol = rtol; - } - } - } - if (last_succeed_atol == 1) { - return false; - } - else { - TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); - } - return true; + const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); + if (ok) { + TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol); + } else { + TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); + } + return ok; } } @@ -355,8 +349,10 @@ struct GemmParams : OpParams { } TuningStatus NumericalCheck(GemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto 
c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -449,8 +445,10 @@ struct GemmAndBiasParams : OpParams { } TuningStatus NumericalCheck(GemmAndBiasParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -546,8 +544,10 @@ struct GemmStridedBatchedParams : OpParams { } TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); auto c_dtype = c10::CppTypeToScalarType::value; - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; @@ -663,7 +663,9 @@ struct ScaledGemmParams : OpParams { } TuningStatus NumericalCheck(ScaledGemmParams *other) { - return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL; + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; } char transa{}; diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h index 1a0d96899906..29affa2d21ff 100644 --- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -506,7 +506,7 @@ class HipblasltGemmOp : public Callable { } hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32") { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; } HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h index d7c45dc91c21..60eaa2e4d475 100644 --- a/aten/src/ATen/cuda/tunable/GemmRocblas.h +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -141,7 +141,7 @@ class RocblasGemmOp : public Callable> { TuningStatus Call(const GemmParams* params) override { auto input_output_type = RocBlasDataTypeFor(); - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) return FAIL; // no support for TF32 in rocBLAS auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); @@ -209,7 +209,7 @@ class RocblasGemmStridedBatchedOp : public Callable> TuningStatus Call(const GemmStridedBatchedParams* params) override { auto input_output_type = RocBlasDataTypeFor(); - if (at::globalContext().float32Precision("cuda", "matmul") == "tf32" && input_output_type == rocblas_datatype_f32_r) + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) return FAIL; // no support for TF32 in rocBLAS 
auto compute_type = RocBlasComputeTypeFor(); auto h_a = DoCastForHalfOrBfloat16(params->alpha); diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md index b30040b7e284..db31af9259a5 100644 --- a/aten/src/ATen/cuda/tunable/README.md +++ b/aten/src/ATen/cuda/tunable/README.md @@ -145,7 +145,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins | PYTORCH_TUNABLEOP_VERBOSE | Default is 0. Set to 1 to enable basic logging. 2 for basic tuning status. 3 for full trace. | | PYTORCH_TUNABLEOP_VERBOSE_FILENAME | Default is "err" for stderr. Set to "out" for stdout or a filename for capturing verbose logging. | | PYTORCH_TUNABLEOP_FILENAME | Default is 'tunableop_results.csv'. | -| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is 0. Set to 1 to enable. | +| PYTORCH_TUNABLEOP_NUMERICAL_CHECK | Default is off. Set 'atol_rtol' to enable, for example "1e-5_1e-5". | | PYTORCH_TUNABLEOP_ROCBLAS_ENABLED | Default is 1. Set to 0 to disable rocblas being considered during tuning. | | PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED | Default is 1. Set to 0 to disable hipblaslt being considered during tuning. | | PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS | Default is 30. Unit is milliseconds. | @@ -173,10 +173,9 @@ All python APIs exist in the `torch.cuda.tunable` module. | get_max_tuning_iterations() -> int | | | set_filename(filename: str, insert_device_ordinal: bool = False) -> None | | | get_filename() -> str | | +| set_numerical_check_tolerances(enable: bool, atol: float, rtol: float) -> None | Enable or disable numerical checking; atol and rtol default to 1e-5. | get_results() -> Tuple[str, str, str, float] | | | get_validators() -> Tuple[str, str] | | -| write_file_on_exit(val: bool) -> None | Default is True. | -| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). | | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. | | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None: -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. 
| diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 3511e48ae061..c5ea0c6dd17c 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -107,14 +107,30 @@ void TuningResultsManager::AddImpl(const std::string& op_signature, } void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { - std::scoped_lock l{lock_}; + bool is_new = false; + ResultEntry inserted = ResultEntry::Null(); - auto it = results_.find(op_signature); - if (it == results_.end()) { - it = results_.insert({op_signature, {}}).first; + // ---- mutate maps under results lock ---- + { + std::scoped_lock l{lock_}; + auto& km = results_[op_signature]; // creates if missing + is_new = (km.find(params_signature) == km.end()); + AddImpl(op_signature, params_signature, std::move(best), km); + if (is_new) { + inserted = km.at(params_signature); // snapshot for I/O after unlocking + } + } + if (!is_new) return; // only write once per unique (op, params) + + TuningContext* ctx = getTuningContext(); + if (ctx->IsTuningEnabled() && !ctx->IsRecordUntunedEnabled()) { + InitRealtimeAppend(ctx->GetFilename(), ctx->GetTuningResultsValidator().GetAllValidators()); + + if (is_new && realtime_out_ && realtime_out_->good()) { + AppendResultLine(op_signature, params_signature, inserted); + } } - AddImpl(op_signature, params_signature, std::move(best), it->second); } void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, @@ -150,6 +166,77 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std } } +void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map& validators) { + std::scoped_lock fl{realtime_file_mutex_}; + + if (realtime_out_ && realtime_out_->good() && realtime_filename_ == filename) { + return; + } + + if (realtime_out_ && realtime_filename_ != filename) { + realtime_out_->flush(); + realtime_out_->close(); + realtime_out_.reset(); + validators_written_ = false; + } + + bool file_exists = false; + bool file_empty = true; + + { + std::ifstream check_file(filename); + if (check_file.good()) { + file_exists = true; + file_empty = (check_file.peek() == std::ifstream::traits_type::eof()); + } + } + + realtime_out_ = std::make_unique(filename, std::ios::out | std::ios::app); + + if (!realtime_out_->good()) { + TORCH_WARN("TunableOp realtime append: failed to open '", filename,"'"); + realtime_out_.reset(); + return; + } + + if(!file_exists || file_empty) { + for(const auto& [key, val] : validators) { + (*realtime_out_) << "Validator," << key << "," << val << std::endl; + realtime_out_->flush(); + } + validators_written_ = true; + + TUNABLE_LOG2("Wrote validators to realtime output file"); + } + + realtime_filename_ = filename; +} + +void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std::string& param_sig, const ResultEntry& result) { + std::scoped_lock fl{realtime_file_mutex_}; + + if(!realtime_out_ || !realtime_out_->good()) { + return; + } + + (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; + realtime_out_->flush(); //ensure immediate write to disk + + TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); +} + +void TuningResultsManager::CloseRealtimeAppend() { + std::scoped_lock fl{realtime_file_mutex_}; + + + if(realtime_out_) { + realtime_out_->flush(); + realtime_out_->close(); + 
realtime_out_.reset(); + TUNABLE_LOG2("Closed realtime output file"); + } +} + void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { std::scoped_lock l{lock_}; @@ -396,7 +483,6 @@ TuningContext::TuningContext() : tuning_enable_{true}, record_untuned_enable_{false}, manager_initialized_{false}, - write_file_on_exit_{true}, numerics_check_enable_{false}, max_tuning_duration_ms_{30}, max_tuning_iterations_{100}, @@ -404,8 +490,6 @@ TuningContext::TuningContext() : max_warmup_iterations_{0}, icache_flush_{true}, rotating_buffer_size_{-1}, - filename_{}, - untuned_file_{}, results_count_from_input_file_{0}, is_shutting_down_{false} { @@ -419,20 +503,8 @@ TuningContext::~TuningContext() { // but doesn't do any computation itself. return; } - auto filename = GetFilename(); - if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) { - if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { - if (results_count_from_input_file_ > 0) { - TUNABLE_LOG1("additional tuning results available, rewriting file ", filename); - } - else { - TUNABLE_LOG1("writing file ", filename); - } - if (!WriteFile(filename)) { - TUNABLE_LOG1("failed to write file ", filename); - } - } - } + TUNABLE_LOG1("Closing File"); + GetTuningResultsManager().CloseRealtimeAppend(); // Since, we do instant logging by default now. if (untuned_file_.good()) { untuned_file_.close(); @@ -513,20 +585,54 @@ std::ofstream& TuningContext::GetUntunedFile(){ return untuned_file_; } -void TuningContext::WriteFileOnExit(bool value) { - write_file_on_exit_ = value; -} void TuningContext::EnableNumericsCheck(bool value) { numerics_check_enable_ = value; } -bool TuningContext::IsNumericsCheckEnabled() const { - const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); - if (env == "1") { - return true; +NumericalCheckConfig TuningContext::GetNumericalCheckConfig() const { + const auto env_opt = c10::utils::get_env("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + + if (!env_opt.has_value()) { + return numerics_cfg_; + } + + const std::string& env = env_opt.value(); + + if (env == "0") { + return NumericalCheckConfig(false, 1e-5, 1e-5); + } + + const size_t underscore = env.find('_'); + + TORCH_CHECK( + underscore != std::string::npos, + "Invalid PYTORCH_TUNABLEOP_NUMERICAL_CHECK format. " + "Expected 'atol_rtol', got: ", + env); + + double atol = 0.0; + double rtol = 0.0; + + try { + atol = std::stod(env.substr(0, underscore)); + rtol = std::stod(env.substr(underscore + 1)); + } catch (const std::exception& e) { + TORCH_CHECK(false, "Failed to parse PYTORCH_TUNABLEOP_NUMERICAL_CHECK: ", e.what()); } - return numerics_check_enable_; + + TORCH_CHECK( atol > 0.0 && rtol > 0.0, "Tolerance values must be positive. 
atol=", atol, ", rtol=", rtol); + return NumericalCheckConfig(true, atol, rtol); +} + +void TuningContext::SetNumericalCheckConfig(bool enabled, double atol, double rtol) { + TORCH_CHECK(atol > 0.0 && rtol > 0.0, "Numerical check tolerances must be positive"); + numerics_cfg_ = {enabled, atol, rtol}; +} + +bool TuningContext::IsNumericsCheckEnabled() const { + const auto cfg = GetNumericalCheckConfig(); + return cfg.enabled || numerics_check_enable_; } void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { @@ -636,11 +742,6 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() { auto filename = GetFilename(); if (!filename.empty() && !IsRecordUntunedEnabled()) { ReadFile(filename); - // attempt immediately to open file for writing to catch errors early - std::ofstream file(filename, std::ios::out | std::ios::app); - if (!file.good()) { - TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); - } } }); return manager_; @@ -746,27 +847,6 @@ bool TuningContext::ReadFile(const std::string& filename_) { return true; } -bool TuningContext::WriteFile(const std::string& filename_) { - std::string filename = filename_.empty() ? GetFilename() : filename_; - std::ofstream file(filename, std::ios::out | std::ios::trunc); - if (!file.good()) { - TUNABLE_LOG1("error opening tuning results file for writing ", filename); - return false; - } - auto validators = GetTuningResultsValidator().GetAllValidators(); - for (const auto& [key, val] : validators) { - file << "Validator," << key << "," << val << std::endl; - } - auto results = GetTuningResultsManager().Dump(); - for (const auto& [op_sig, kernelmap] : results) { - for (const auto& [param_sig, result] : kernelmap) { - file << op_sig << "," << param_sig << "," << result << std::endl; - } - } - file.close(); - return true; -} - namespace { struct MaybeDelete { diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h index 5e885d4764d2..17b4ea34ddf6 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.h +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -103,10 +103,24 @@ class TORCH_CUDA_CPP_API TuningResultsManager { void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, const std::string& params_signature, const std::string& blas_signature); + + void InitRealtimeAppend( + const std::string& filename, + const std::unordered_map& validators); + + void AppendResultLine(const std::string& op_sig, + const std::string& param_sig, + const ResultEntry& result); + + void CloseRealtimeAppend(); // For clean shutdown private: std::mutex lock_; + std::mutex realtime_file_mutex_; + std::unique_ptr realtime_out_; + std::string realtime_filename_; ResultsMap results_; UntunedMap untuned_results_; + bool validators_written_ = false; }; @@ -134,6 +148,16 @@ class TORCH_CUDA_CPP_API TuningResultsValidator { GetValidateFuncs validators_; }; +struct NumericalCheckConfig { + bool enabled{false}; + double atol{1e-5}; + double rtol{1e-5}; + + NumericalCheckConfig() = default; + NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {} +}; + + class TORCH_CUDA_CPP_API TuningContext { public: TuningContext(); @@ -155,6 +179,8 @@ class TORCH_CUDA_CPP_API TuningContext { void EnableNumericsCheck(bool value); bool IsNumericsCheckEnabled() const; + void SetNumericalCheckConfig(bool enabled, double atol, double rtol); + NumericalCheckConfig GetNumericalCheckConfig() const; void SetMaxTuningDurationMs(int max_duration_ms); int 
GetMaxTuningDurationMs() const; @@ -185,10 +211,7 @@ class TORCH_CUDA_CPP_API TuningContext { void SetFilename(const std::string& filename, bool insert_device_ordinal=false); std::string GetFilename() const; - void WriteFileOnExit(bool value); - bool ReadFile(const std::string& filename={}); - bool WriteFile(const std::string& filename={}); template void Log(int level, Types... args) { @@ -207,7 +230,6 @@ class TORCH_CUDA_CPP_API TuningContext { bool tuning_enable_; bool record_untuned_enable_; bool manager_initialized_; - bool write_file_on_exit_; bool numerics_check_enable_; int max_tuning_duration_ms_; int max_tuning_iterations_; @@ -222,6 +244,8 @@ class TORCH_CUDA_CPP_API TuningContext { std::ofstream untuned_file_; size_t results_count_from_input_file_; bool is_shutting_down_; + + NumericalCheckConfig numerics_cfg_{}; }; TORCH_CUDA_CPP_API TuningContext* getTuningContext(); diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h index d941c230630c..c014d1ea569c 100644 --- a/aten/src/ATen/cuda/tunable/TunableGemm.h +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -109,7 +109,8 @@ class DefaultScaledGemmOp : public Callable> { params->c_scale_ptr, params->ldc, params->c_dtype, - params->use_fast_accum); + params->use_fast_accum, + std::nullopt /* alpha */); return OK; } }; diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h index 6ca9e213e148..d7bf0e6d93d8 100644 --- a/aten/src/ATen/cuda/tunable/TunableOp.h +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -29,7 +29,7 @@ template class Callable { public: virtual ~Callable() = default; - virtual TuningStatus Call(const ParamsT*) { + virtual TuningStatus Call(const ParamsT* /*unused*/) { return FAIL; } virtual TuningStatus IsSupported(const ParamsT* params) { @@ -267,27 +267,10 @@ class TunableOp { for (size_t i = 0; i < op_names_.size(); i++) { auto* candidate = ops_[op_names_[i]].get(); // borrow pointer - if (do_numerics_check) { - ParamsT* numerical_params = params->DeepCopy(false); - auto status = candidate->Call(numerical_params); - if (status != OK) { - numerical_params->Delete(); - TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } - status = reference_params->NumericalCheck(numerical_params); - numerical_params->Delete(); - if (status != OK) { - TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } - } - else { - auto status = candidate->Call(reusable_params[0]); - if (status != OK) { - TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); - continue; - } + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; } // collect a small profile @@ -310,6 +293,22 @@ class TunableOp { continue; } + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + // for warmup does user set max duration, max iters, or both? 
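With the numerics check now driven by a tolerance config and results appended to the file as they are found, a usage sketch follows (assuming the set_numerical_check_tolerances binding from the API table above is available in this build):

```python
import os
# Environment form documented above: "atol_rtol".
os.environ["PYTORCH_TUNABLEOP_ENABLE"] = "1"
os.environ["PYTORCH_TUNABLEOP_NUMERICAL_CHECK"] = "1e-4_1e-4"

import torch
torch.cuda.tunable.enable(True)
torch.cuda.tunable.set_numerical_check_tolerances(True, 1e-4, 1e-4)  # programmatic form from the API table

a = torch.randn(512, 512, device="cuda", dtype=torch.half)
b = torch.randn(512, 512, device="cuda", dtype=torch.half)
c = a @ b   # candidates are profiled first, then numerics-checked at these tolerances
```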
// warmup is skipped by default, i.e. warmup_iter = 0 // warmup will be set to the non-zero value of max_warmup_duration diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 2fc1867d276d..dbd178e0f8ee 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -141,7 +141,7 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo size[i] = (int) t.size(i); } for (const auto i : c10::irange(dim, pad)) { - size[i] = (int) 1; + size[i] = 1; } dim = std::max(dim, pad); cudnnTensorFormat_t filter_format{}; diff --git a/aten/src/ATen/cudnn/Types.cpp b/aten/src/ATen/cudnn/Types.cpp index f6e080c433d6..f612436f5672 100644 --- a/aten/src/ATen/cudnn/Types.cpp +++ b/aten/src/ATen/cudnn/Types.cpp @@ -2,6 +2,8 @@ #include +#include + namespace at::native { cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { @@ -20,9 +22,10 @@ cudnnDataType_t getCudnnDataTypeFromScalarType(const at::ScalarType dtype) { } else if (dtype == at::kByte) { return CUDNN_DATA_UINT8; } - std::string msg("getCudnnDataTypeFromScalarType() not supported for "); - msg += toString(dtype); - throw std::runtime_error(msg); + TORCH_CHECK(false, + "getCudnnDataTypeFromScalarType() not supported for ", + toString(dtype) + ); } cudnnDataType_t getCudnnDataType(const at::Tensor& tensor) { diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 00573e3cf701..f1f205691747 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -166,6 +166,10 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { return false; } + virtual bool supportsBFloat16RNNWithCuDNN() const { + return false; + } + virtual long versionCuDNN() const { TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HPUHooksInterface.h b/aten/src/ATen/detail/HPUHooksInterface.h index 8cf9502a7e1b..3240ff4dac13 100644 --- a/aten/src/ATen/detail/HPUHooksInterface.h +++ b/aten/src/ATen/detail/HPUHooksInterface.h @@ -25,7 +25,7 @@ struct TORCH_API HPUHooksInterface : AcceleratorHooksInterface { false, "Cannot get device of pointer on HPU without HPU backend"); } - bool isPinnedPtr(const void*) const override { + bool isPinnedPtr(const void* /*data*/) const override { return false; } diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 70fbf3135a3c..ee23a0320f7c 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -410,7 +410,7 @@ struct ExistingBdimBatchRuleHelper -Tensor& unary_inplace_batch_rule(Tensor& self, std::optional, ExtraArgs... extra_args) { +Tensor& unary_inplace_batch_rule(Tensor& self, std::optional /*unused*/, ExtraArgs... extra_args) { INVOKE(self, Method)(std::forward(extra_args)...); return self; } diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index 4f74468af085..cab76b3af9ad 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -39,7 +39,7 @@ Tensor vdot_decomp(const Tensor& A, const Tensor& B) { // NB: I wrote this like this because we *might* want its for a future matmul // batch rule that isn't decomposed... 
// "tv" = tensor @ vector -static std::tuple> tv_batch_rule( +std::tuple> tv_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { if (self_bdim && other_bdim) { @@ -66,7 +66,7 @@ static std::tuple> tv_batch_rule( TORCH_INTERNAL_ASSERT(false, "can't get here"); } -static std::tuple> mv_batch_rule( +std::tuple> mv_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -79,7 +79,7 @@ static std::tuple> mv_batch_rule( return tv_batch_rule(self, self_bdim, other, other_bdim); } -static std::tuple> mm_batch_rule( +std::tuple> mm_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -94,7 +94,7 @@ static std::tuple> mm_batch_rule( return std::make_tuple( at::matmul(self_, other_), 0 ); } -static std::tuple> bmm_batch_rule( +std::tuple> bmm_batch_rule( const Tensor& self, std::optional self_bdim, const Tensor& other, std::optional other_bdim) { auto self_logical_rank = rankWithoutBatchDim(self, self_bdim); @@ -176,7 +176,7 @@ struct LinalgCheckMatrixUnaryRuleHelper; template struct LinalgCheckMatrixUnaryRuleHelper> { - static inline Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { + static Tensor check_and_reshape_input(const Tensor& tensor, std::optional batch_dim) { TORCH_CHECK(rankWithoutBatchDim(tensor, batch_dim) >= 2, op_name, ": The input tensor A must have at least 2 dimensions."); return moveBatchDimToFront(tensor, batch_dim); } @@ -222,7 +222,7 @@ struct LinalgCheckMatrixBinaryRuleHelper; template struct LinalgCheckMatrixBinaryRuleHelper> { - static inline std::tuple check_inputs_and_reshape_inputs( + static std::tuple check_inputs_and_reshape_inputs( const Tensor& first, std::optional first_bdim, const Tensor& second, std::optional second_bdim) { TORCH_CHECK(rankWithoutBatchDim(first, first_bdim) >= 2, @@ -250,7 +250,7 @@ struct LinalgCheckMatrixBinaryRuleHelper> } }; -static void expect_at_least_rank( +void expect_at_least_rank( const Tensor& tensor, std::optional tensor_bdim, int64_t expected_rank, @@ -472,7 +472,7 @@ atol_rtol_tensor_batch_rule( return std::make_tuple(Func(input_, atol_, rtol_, hermitian), 0); } -static std::tuple> +std::tuple> pinv_batch_rule( const Tensor& input, std::optional input_bdim, const std::optional& atol, const std::optional atol_bdim, const std::optional& rtol, diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 6e63708a90f4..5fba8d257ceb 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -213,40 +213,22 @@ static cudnn_grid_sample_backward_batch_rule( return grid_sample_backward_helper_out(std::move(bw_out), 0, 0, bdim_size); } -// TODO: replace with targetable functionalization +// uses functional formulation for one_hot under vmap to be compatible with +// fakeTensor/dynamic shapes and compiled functorch transforms. +// mirrors the meta path in aten/src/ATen/native/Onehot.cpp, +// but requires explicit positive num_classes under vmap to avoid +// data-dependent output shapes. 
static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); - auto shape = self.sym_sizes().vec(); - - // empty tensor could be converted to one hot representation, - // but shape inference is not possible. - if (self.sym_numel() == 0) { - if (num_classes <= 0) { - TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); - } else { - shape.emplace_back(num_classes); - return at::empty_symint(shape, self.options()); - } - } + // disallow implicit inference under vmap; this would be data-dependent + // and is intentionally guarded by Dynamo in torch/_dynamo/variables/torch.py. TORCH_CHECK(num_classes > 0, "When vmap-ing torch.nn.functional.one_hot, please " "provide an explicit positive num_classes argument."); - // Disabling all of the following checks. This is OK because scatter has checks too. - // Maybe one_hot should be a primitive wrt autograd so we don't have to deal with this. - // // non-empty tensor - // if (self.device().type() != at::kCUDA) { - // //for cuda, rely on device assert thrown by scatter - // TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); - // } - // if (self.device().type() != at::kCUDA) { - // //rely on device asserts from scatter to avoid sync here - // TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); - // } - - shape.emplace_back(num_classes); - Tensor ret = at::zeros_symint(shape, self.options()); - return ret.scatter(-1, self.unsqueeze(-1), 1); + const auto options = self.options(); + at::Tensor index = at::arange(num_classes, options); + return at::eq(self.unsqueeze(-1), index).to(at::kLong); } template diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index 14f03bd17f4d..f5c770371de8 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -12,13 +12,14 @@ #include #include #include +#include // NOLINTBEGIN(bugprone-unchecked-optional-access) namespace at::functorch { namespace { -static bool any_has_value(ArrayRef> bdims) { +bool any_has_value(ArrayRef> bdims) { for (const auto& bdim : bdims) { if (bdim.has_value()) { return true; @@ -27,7 +28,7 @@ static bool any_has_value(ArrayRef> bdims) { return false; } -static int64_t get_num_leading_nones(ArrayRef> indices) { +int64_t get_num_leading_nones(ArrayRef> indices) { int64_t result = 0; for (const auto& idx : indices) { if (!idx.has_value() || !idx->defined()) { @@ -39,7 +40,7 @@ static int64_t get_num_leading_nones(ArrayRef> indices) { return result; } -static int64_t get_max_index_logical_dim( +int64_t get_max_index_logical_dim( ArrayRef> indices, ArrayRef> indices_bdims) { int64_t max_logical_dim = -1; @@ -56,7 +57,7 @@ static int64_t get_max_index_logical_dim( return max_logical_dim; } -static std::vector> batchIndices( +std::vector> batchIndices( at::TensorOptions options, ArrayRef> indices, ArrayRef> indices_bdims, @@ -94,9 +95,10 @@ static std::vector> batchIndices( if (index.has_value() && index->sym_numel() != 0) { const auto idx_bdim = indices_bdims[i]; indices_.emplace_back(maybePadToLogicalRank(moveBatchDimToFront(index.value(), idx_bdim), idx_bdim, maxLogicalRank)); - if (index.value().dtype() == kBool && indices_bdims[i].has_value()) { - throw std::runtime_error("vmap: We do not support batching operators that can support dynamic shape. 
Attempting to batch over indexing with a boolean mask."); - } + TORCH_CHECK( + !(index.value().dtype() == kBool) || !indices_bdims[i].has_value(), + "vmap: We do not support batching operators that can support dynamic shape. Attempting to batch over indexing with a boolean mask." + ); } else { indices_.push_back(index); } @@ -124,7 +126,7 @@ static std::vector> batchIndices( // Define an "advanced index" to be a selection object that is // a non-trivial Tensor (i.e. it does not represent :). -static bool is_advanced_index(const std::optional& idx) { +bool is_advanced_index(const std::optional& idx) { if (!idx.has_value()) { return false; } @@ -135,7 +137,7 @@ static bool is_advanced_index(const std::optional& idx) { } // See NOTE: [advanced indices adjacent] for definition -static bool are_advanced_indices_adjacent(ArrayRef> indices) { +bool are_advanced_indices_adjacent(ArrayRef> indices) { int64_t num_advanced_indices_regions = 0; bool in_advanced_indices_region = false; for (const auto& idx : indices) { @@ -163,7 +165,7 @@ static bool are_advanced_indices_adjacent(ArrayRef> indice // - result: Tensor[B, 4, 5, 6, 2, 3, 7, 8] // ------- ---- // region2 region1 -static Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { +Tensor swap_regions(const Tensor& tensor, int64_t first_region_size, int64_t second_region_size) { VmapDimVector permutation(tensor.dim(), 0); std::iota(permutation.begin(), permutation.end(), 0); std::rotate( @@ -551,7 +553,7 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List return self; } -static Tensor maybe_permute_values( +Tensor maybe_permute_values( const Tensor& values, ArrayRef> orig_indices, ArrayRef> orig_indices_bdims) { @@ -1050,7 +1052,7 @@ std::tuple> index_add_batch_rule( other, other_bdim, alpha, false); } -static std::tuple binary_pointwise_align( +std::tuple binary_pointwise_align( const Tensor & self, std::optional self_bdim, const Tensor & mask, diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index cd1d0e1487fb..08db1d202b4e 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -346,7 +346,7 @@ std::tuple> slice_batch_rule( return std::make_tuple(std::move(result), 0); } -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index 3eccc94d3ea6..985b289b3fe0 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -160,6 +160,10 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({ DispatchKey::CUDA, DispatchKey::CPU, DispatchKey::PrivateUse1, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, }); inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 4ec902b668e4..69af08a7bd7c 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -465,11 +465,11 @@ static void dynamicLayerBack(const c10::OperatorHandle& op, torch::jit::Stack* s // used for functions that have aliasing operations but should be treated like they're out of place (i.e. 
lift_fresh) static void dynamicLayerBackGradSpecialCase(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - return dynamicLayerBack(op, stack, true); + dynamicLayerBack(op, stack, true); } static void dynamicLayerBackFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - return dynamicLayerBack(op, stack, false); + dynamicLayerBack(op, stack, false); } TORCH_LIBRARY_IMPL(_, FuncTorchDynamicLayerFrontMode, m) { diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 1c76230fb455..2a0e40199449 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -106,9 +107,10 @@ struct VmapInterpreterMeta { template friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { - if (json_t.batchSize_.is_heap_allocated()) { - throw std::runtime_error("Serialization for heap-allocated SymInt is not implemented yet"); - } + TORCH_CHECK( + !json_t.batchSize_.is_heap_allocated(), + "Serialization for heap-allocated SymInt is not implemented yet" + ); json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); json_j["randomness"] = static_cast(json_t.randomness_); } @@ -302,7 +304,7 @@ struct Interpreter { } else if (meta.contains("Functionalize")) { json_t.meta_.emplace(meta["Functionalize"].template get()); } else { - throw std::runtime_error("unknown interpreter metadata type"); + TORCH_CHECK(false, "unknown interpreter metadata type"); } } diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 69517407e682..22a15c168445 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -68,18 +68,18 @@ namespace at::functorch { namespace{ // PyTorch allows operations to specify dim 0 and dim -1 on a scalar tensor. -static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { +bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; } -static int64_t get_current_level() { +int64_t get_current_level() { auto maybe_level = maybeCurrentDynamicLayer(); TORCH_INTERNAL_ASSERT(maybe_level.has_value()); return maybe_level->layerId(); } // This check should probably go into the dispatcher... -static bool participatesInCurrentLevel(const Tensor& self) { +bool participatesInCurrentLevel(const Tensor& self) { auto current_level = get_current_level(); auto* maybe_batched_impl = maybeGetBatchedImpl(self); if (!maybe_batched_impl) { @@ -90,7 +90,7 @@ static bool participatesInCurrentLevel(const Tensor& self) { return self_level == current_level; } -static bool participatesInCurrentLevel(ITensorListRef self) { +bool participatesInCurrentLevel(ITensorListRef self) { for (const Tensor& tensor : self) { if (participatesInCurrentLevel(tensor)) { return true; @@ -285,7 +285,7 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // given (sizes, strides, storage_offset) returns the maximum location that // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). 
-static std::optional maximum_indexable_location( +std::optional maximum_indexable_location( c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { @@ -298,7 +298,7 @@ static std::optional maximum_indexable_location( // This checks that the range of possible memory locations accessible by // x.as_strided(sizes, strides, maybe_storage_offset) // are within the bounds of possible memory locations accessible by x. -static void checkBasicAsStridedValidForSlice( +void checkBasicAsStridedValidForSlice( const Tensor& physical_tensor, int64_t num_batch_dims, c10::SymIntArrayRef sizes, diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index ecedc729ccd7..667e92970033 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optiona return output; } -static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { +inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { @@ -108,9 +109,7 @@ Tensor binary_cross_entropy_with_logits_hack( } Tensor trace_backward_decomp(const Tensor& grad, IntArrayRef sizes) { - if (sizes.size() != 2) { - throw std::runtime_error("expected matrix input"); - } + TORCH_CHECK(sizes.size() == 2, "expected matrix input"); auto grad_input = at::zeros(sizes[0] * sizes[1], grad.options()); auto indices = at::arange(0, grad_input.numel(), sizes[1] + 1, grad.options().dtype(at::kLong)); // Workaround using index_put instead of yet unsupported index_fill_ @@ -128,7 +127,7 @@ namespace { template using Ctype = std::conditional_t; -static Tensor make_feature_noise(const Tensor& input) { +Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); TORCH_CHECK(input.dim() >= 2, "Feature dropout requires at least 2 dimensions in the input"); std::vector sizes; @@ -142,7 +141,7 @@ static Tensor make_feature_noise(const Tensor& input) { return at::empty(sizes, input.options()); } -static bool is_fused_kernel_acceptable(const Tensor& input, double p) { +bool is_fused_kernel_acceptable(const Tensor& input, double p) { return (input.is_cuda() || input.is_xpu() || input.is_lazy() || input.is_privateuseone()) && p > 0 && p < 1 && input.numel() > 0; } @@ -211,7 +210,7 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) -static Tensor dropout(const Tensor& input, double p, bool train) { +Tensor dropout(const Tensor& input, double p, bool train) { auto result = [&]() { NoNamesGuard guard; if (train && is_fused_kernel_acceptable(input, p)) { diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index f4316def4fb4..cfdecaac778b 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -90,6 +90,10 @@ class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllo allocator_->setMemoryFraction(fraction, device); } + std::vector 
getExpandableSegmentSizes(c10::DeviceIndex device) override { + return allocator_->getExpandableSegmentSizes(device); + } + void enable(bool value) override { allocator_->enable(value); } diff --git a/aten/src/ATen/metal/Context.h b/aten/src/ATen/metal/Context.h index 1f977cf50d9e..e4c6da738e0d 100644 --- a/aten/src/ATen/metal/Context.h +++ b/aten/src/ATen/metal/Context.h @@ -18,7 +18,7 @@ extern std::atomic g_metal_impl_registry; class MetalImplRegistrar { public: - explicit MetalImplRegistrar(MetalInterface*); + explicit MetalImplRegistrar(MetalInterface* /*impl*/); }; at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src); diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index d858df073397..6c58de099648 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,7 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ - "The MPS backend is supported on MacOS 13.0+.", \ + "The MPS backend is supported on MacOS 14.0+. ", \ "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index a2ec221c1bfe..34fbd31af91d 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -70,7 +70,10 @@ } void* MPSHooks::getCommandBuffer() const { - return at::mps::getDefaultMPSStream()->commandBuffer(); + auto stream = at::mps::getDefaultMPSStream(); + // Release the pending computeCommandEncoder, as an extension is likely to allocate a new one + stream->endKernelCoalescing(); + return stream->commandBuffer(); } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index e9627a343ad6..71325bd69e1d 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,7 +158,18 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; + // For some reason fillBuffer stopped working for length > 4GB on macOS 26 + // See https://github.com/pytorch/pytorch/issues/163962 + // Work around this by batching fill commands into 4GB chunks + constexpr size_t max_copy_size = 0x100000000; // 4GB + size_t bytes_filled = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value]; + bytes_filled += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index db11422f2d83..c164120a1f3c 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -240,8 +240,8 @@ TORCH_META_FUNC(gelu_backward) ( namespace at::native { -static const double SELU_ALPHA = 1.6732632423543772848170429916717; -static const double SELU_SCALE = 1.0507009873554804934193349852946; +static constexpr double SELU_ALPHA = 1.6732632423543772848170429916717; +static constexpr double SELU_SCALE = 1.0507009873554804934193349852946; DEFINE_DISPATCH(elu_stub); DEFINE_DISPATCH(elu_backward_stub); @@ -670,6
+670,8 @@ Tensor rrelu_with_noise_backward( } Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional generator) { + TORCH_CHECK(std::isfinite(lower.to()), "rrelu: lower bound must be finite, got ", lower.to()); + TORCH_CHECK(std::isfinite(upper.to()), "rrelu: upper bound must be finite, got ", upper.to()); TORCH_CHECK(lower.to() <= upper.to(), "Lower bound should be less than or equal to the upper bound") auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator)); diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index e744c2b5e0e7..5821cd561cdf 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -24,7 +24,7 @@ namespace at::native { namespace { template -static void adaptive_avg_pool3d_out_frame( +void adaptive_avg_pool3d_out_frame( const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, @@ -176,7 +176,7 @@ void adaptive_avg_pool3d_out_cpu_template( } template -static void adaptive_avg_pool3d_backward_out_frame( +void adaptive_avg_pool3d_backward_out_frame( scalar_t* gradInput_p, const scalar_t* gradOutput_p, int64_t sizeD, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 46dc5623b595..ef4bab3ec1de 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -93,7 +93,7 @@ namespace { // 5d tensor B x D x T x H x W template -static void adaptive_max_pool3d_single_out_frame( +void adaptive_max_pool3d_single_out_frame( const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, @@ -170,7 +170,7 @@ static void adaptive_max_pool3d_single_out_frame( } template -static void adaptive_max_pool3d_out_frame( +void adaptive_max_pool3d_out_frame( const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, @@ -202,7 +202,7 @@ static void adaptive_max_pool3d_out_frame( } template -static void adaptive_max_pool3d_backward_single_out_frame( +void adaptive_max_pool3d_backward_single_out_frame( scalar_t *gradInput_p, const scalar_t *gradOutput_p, const int64_t *ind_p, @@ -241,7 +241,7 @@ static void adaptive_max_pool3d_backward_single_out_frame( } template -static void adaptive_max_pool3d_backward_out_frame( +void adaptive_max_pool3d_backward_out_frame( scalar_t *gradInput_data, const scalar_t *gradOutput_data, const int64_t *indices_data, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 8a588b7cac11..365cfa311512 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -153,7 +153,7 @@ namespace at::native { namespace { template -static void avg_pool3d_out_frame( +void avg_pool3d_out_frame( const scalar_t *input_p, scalar_t *output_p, int64_t nslices, @@ -333,7 +333,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( namespace { template -static void avg_pool3d_backward_out_frame( +void avg_pool3d_backward_out_frame( scalar_t *gradInput_p, const scalar_t *gradOutput_p, int64_t nslices, diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index d323e54a95ab..6669357cda45 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -2060,7 +2060,7 @@ std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { } // TODO 
Deprecate this function in favour of linalg_lu_factor_ex -std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool) { +std::tuple _lu_with_info(const Tensor& self, bool compute_pivots, bool /*unused*/) { TORCH_WARN_ONCE( "torch.lu is deprecated in favor of torch.linalg.lu_factor / torch.linalg.lu_factor_ex and will be ", "removed in a future PyTorch release.\n", diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 54fb610722d6..df64aa42e602 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -143,13 +143,13 @@ Tensor& cholesky_inverse_kernel_impl(Tensor& result, Tensor& infos, bool upper) For more info see https://github.com/pytorch/pytorch/issues/145801#issuecomment-2631781776 */ template -static inline +inline std::enable_if_t, int> lapack_work_to_int(const T val) { const auto next_after = std::nextafter(val, std::numeric_limits::infinity()); return std::max(1, std::ceil(next_after)); } template -static inline +inline std::enable_if_t::value, int> lapack_work_to_int(const T val) { return lapack_work_to_int(val.real()); } @@ -343,7 +343,7 @@ void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, c For further details, please see the LAPACK documentation for GEQRF. */ template -static void apply_geqrf(const Tensor& input, const Tensor& tau) { +void apply_geqrf(const Tensor& input, const Tensor& tau) { #if !AT_BUILD_WITH_LAPACK() TORCH_CHECK( false, @@ -1039,7 +1039,7 @@ void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tr } template -static void apply_svd(const Tensor& A, +void apply_svd(const Tensor& A, const bool full_matrices, const bool compute_uv, const Tensor& U, diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 49366151ae60..6b7496f49732 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -58,7 +58,7 @@ scalar_t dot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, template scalar_t vdot_impl(int64_t n, const scalar_t *x, int64_t incx, const scalar_t *y, int64_t incy); -static constexpr inline bool lda_cond(int64_t m, int64_t n, int64_t lda) { +static constexpr bool lda_cond(int64_t m, int64_t n, int64_t lda) { return n == 1 || lda >= std::max(1L, m); } diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index 5f3976bd18d6..b476ca3cff8f 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -286,7 +286,7 @@ template void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *in #if AT_BUILD_WITH_BLAS() template <> bool scal_use_fast_path(int64_t n, int64_t incx) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return n <= intmax && incx <= intmax; } @@ -315,7 +315,7 @@ bool gemv_use_fast_path( int64_t incx, [[maybe_unused]] float beta, int64_t incy) { - auto intmax = std::numeric_limits::max(); + auto constexpr intmax = std::numeric_limits::max(); return (m <= intmax) && (n <= intmax) && (lda <= intmax) && (incx > 0) && (incx <= intmax) && (incy > 0) && (incy <= intmax); } @@ -375,7 +375,7 @@ static void bf16_gemv_trans( const at::BFloat16 beta, at::BFloat16* y, const int incy) { - return bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); + bf16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy); } template <> diff --git 
a/aten/src/ATen/native/BucketizationUtils.h b/aten/src/ATen/native/BucketizationUtils.h index 70878ecd704d..bd19f9c987f1 100644 --- a/aten/src/ATen/native/BucketizationUtils.h +++ b/aten/src/ATen/native/BucketizationUtils.h @@ -70,7 +70,7 @@ inline void searchsorted_maybe_trim_input_tensors( const Tensor& raw_boundaries) { Tensor trimmed_sorter; Tensor raw_sorter; - return searchsorted_maybe_trim_input_tensors( + searchsorted_maybe_trim_input_tensors( trimmed_input, trimmed_boundaries, trimmed_sorter, diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index 20be0d6fe017..c17a70ea308a 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -991,7 +991,7 @@ std::size_t UnsafeUkernelKeyHasher::operator()(const PackKey& key) cons template struct KernelCache { using kstore_t = std::unordered_map, UnsafeUkernelKeyHasher>; - static inline std::shared_ptr&& fetch_or_create( + static std::shared_ptr&& fetch_or_create( const key_t& key, const std::function()>& callback) { auto&& search = get_store().find(key); @@ -1003,7 +1003,7 @@ struct KernelCache { } } - static inline kstore_t& get_store() { + static kstore_t& get_store() { static thread_local kstore_t cache_kernels; return cache_kernels; } @@ -1067,7 +1067,7 @@ struct GemmHelper { struct Brgemm : public KernelCache { // Fetch/create GemmHelper object and execute brgemm with batch size = 1 template - static inline void call( + static void call( int64_t M, int64_t N, int64_t K, @@ -1118,12 +1118,12 @@ struct Brgemm : public KernelCache { .execute(A, B, (*value).A_B_offsets, C, (*value).scratchpad.data()); } - static inline std::shared_ptr& get_current() { + static std::shared_ptr& get_current() { static thread_local std::shared_ptr current; return current; } - static inline bool device_check(ScalarType dtype) { + static bool device_check(ScalarType dtype) { if (!at::globalContext().userEnabledMkldnn()) { return false; } @@ -1153,7 +1153,7 @@ using pack_t = dnnl::ukernel::brgemm_pack_B; using pack_t = dnnl::ukernel::transform; #endif struct Pack : public KernelCache { - static inline void call( + static void call( int64_t K, int64_t N, int64_t ld_in, @@ -1182,7 +1182,7 @@ struct Pack : public KernelCache { } } - static inline bool could_pack(ScalarType dtype) { + static bool could_pack(ScalarType dtype) { if (!at::globalContext().userEnabledMkldnn()) { return false; } diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index 51e005c2901b..f0270a02b267 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -71,7 +71,7 @@ namespace at::native { namespace { -static void col2im_out_cpu_template( +void col2im_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index e160c84ced33..892144ac663a 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -465,8 +465,11 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor return false; } - auto fmt = input.suggest_memory_format(); - return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + auto is_channel_last = [](const at::Tensor& t) { + auto fmt = t.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + }; + return is_channel_last(input) || is_channel_last(weight); } } // namespace at::native diff --git 
a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ab427f396e34..1158359be239 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -32,10 +32,6 @@ #include #endif -#ifdef USE_MPS -#include -#endif - #ifndef AT_PER_OPERATOR_HEADERS #include #include @@ -410,11 +406,23 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + // broken on cuDNN 9.8 + if (cudnn_version >= 90800) { + if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && + (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) && + weight.dim() == 5) { + for (int i = 2; i < weight.dim(); i++) { + if (weight.size(i) != 1) { + return false; + } + } + } + } if (needs_64bit_indexing_no_split(input, weight)) { - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -422,9 +430,6 @@ struct ConvParams { return false; } } - if (!input.is_cuda() || !cudnn_enabled) { - return false; - } if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -443,16 +448,19 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { return false; } - if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { - // always use cudnn_depthwise for channels_last format - return true; - } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { + if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1; + // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + if (cudnn_version < 0 || cudnn_version > 91000) { + return false; + } + } + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." 
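Aside: the gates above compare detail::getCUDAHooks().versionCuDNN() against integer thresholds (90300, 90800, 91000). Assuming cuDNN 9.x encodes its version as major*10000 + minor*100 + patchlevel — an assumption consistent with the "broken on cuDNN 9.8" comment next to the 90800 check, but not stated in the patch itself — a minimal standalone sketch of how those thresholds decode:

```cpp
// Illustrative sketch only (not part of the patch): decodes the integer
// cuDNN version used by the gates above, assuming the cuDNN 9.x scheme
// CUDNN_VERSION = major * 10000 + minor * 100 + patchlevel.
#include <cstdio>

struct CudnnVersion {
  long major;
  long minor;
  long patch;
};

static CudnnVersion decode_cudnn_version(long v) {
  return {v / 10000, (v / 100) % 100, v % 100};
}

int main() {
  for (long v : {90300L, 90800L, 91000L}) {
    const CudnnVersion d = decode_cudnn_version(v);
    std::printf("%ld -> %ld.%ld.%ld\n", v, d.major, d.minor, d.patch);
  }
  return 0;
}
```

Read this way, 90300 maps to 9.3.0, 90800 to 9.8.0, and 91000 to 9.10.0, so the `cudnn_version > 91000` guard in use_cudnn_depthwise excludes releases newer than 9.10, matching the "first broken in 9.11x" TODO.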
@@ -462,6 +470,10 @@ struct ConvParams { return true; } } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + // always use cudnn_depthwise for channels_last format + return true; + } if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -646,6 +658,7 @@ static void check_shape_forward(const at::Tensor& input, TORCH_CHECK(!params.is_output_padding_neg(), "negative output_padding is not supported"); TORCH_CHECK(!params.is_stride_nonpos(), "non-positive stride is not supported"); TORCH_CHECK(!params.is_dilation_neg(), "dilation should be greater than zero"); + TORCH_CHECK(groups > 0, "expected groups to be greater than 0, but got groups=", groups); TORCH_CHECK(weight_dim == k, "Expected ", weight_dim, "-dimensional input for ", weight_dim, @@ -690,7 +703,7 @@ static void check_shape_forward(const at::Tensor& input, // If kernel size is incorrect std::ostringstream input_ss; std::ostringstream kernel_ss; - std::string separator = ""; + std::string separator; for (int i = 0, len = input_shape.size(); i < len; ++i) { input_ss << separator << input_shape[i]; @@ -1007,7 +1020,7 @@ static Tensor convolution_same( if (symmetric_padding) { // All backends handle symmetric padding natively - SymDimVector output_padding(static_cast(dim)); + SymDimVector output_padding(dim); return at::convolution_symint(input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1027,7 +1040,7 @@ static Tensor convolution_same( } } auto padded_input = at::constant_pad_nd_symint(input, pad_nd, 0); - SymDimVector output_padding(static_cast(dim)); + SymDimVector output_padding(dim); return at::convolution_symint(padded_input, weight, bias, stride, padding_l, dilation, false, output_padding, groups); } @@ -1162,7 +1175,7 @@ at::Tensor convolution( bool deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); return at::_convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, - ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN("conv")); + ctx.benchmarkCuDNN(), deterministic, ctx.userEnabledCuDNN(), ctx.allowTF32CuDNN(at::Float32Op::CONV)); } at::Tensor convolution_overrideable( @@ -1307,7 +1320,7 @@ ConvBackend select_conv_backend( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); - params.allow_tf32 = ctx.allowTF32CuDNN("conv"); + params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); auto input = input_r; auto weight = weight_r; @@ -1429,12 +1442,8 @@ static inline at::MemoryFormat determine_backend_memory_format( } break; case ConvBackend::Mps: + case ConvBackend::MpsTranspose: if (mps_conv_use_channels_last(input, weight)) { -#ifdef USE_MPS - if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) { - break; - } -#endif backend_memory_format = (k == 5) ? 
MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast; } break; @@ -1691,7 +1700,7 @@ at::Tensor _convolution( c10::MaybeOwned bias_r_maybe_owned = at::borrow_from_optional_tensor(bias_r_opt); const Tensor& bias_r = *bias_r_maybe_owned; - return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN("conv")); + return at::_convolution(input_r, weight_r, bias_r, stride_, padding_, dilation_, transposed_, output_padding_, groups_, benchmark, deterministic, cudnn_enabled, at::globalContext().allowTF32CuDNN(at::Float32Op::CONV)); } std::tuple convolution_backward_overrideable( @@ -1989,7 +1998,7 @@ std::tuple convolution_backward( params.benchmark = ctx.benchmarkCuDNN(); params.deterministic = ctx.deterministicCuDNN() || ctx.deterministicAlgorithms(); params.cudnn_enabled = ctx.userEnabledCuDNN(); - params.allow_tf32 = ctx.allowTF32CuDNN("conv"); + params.allow_tf32 = ctx.allowTF32CuDNN(at::Float32Op::CONV); // Validate inputs. check_shape_backward(input, weight.sizes(), params); diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 619542c29ef5..538a893d54ea 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -25,7 +25,7 @@ namespace at::native { namespace { -static Tensor compute_columns2d( +Tensor compute_columns2d( const Tensor& input, IntArrayRef padding, IntArrayRef stride, @@ -93,7 +93,7 @@ static Tensor compute_columns2d( return columns.contiguous(); } -static inline void slow_conv2d_shape_check( +inline void slow_conv2d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -205,7 +205,7 @@ static inline void slow_conv2d_shape_check( } } -static inline Tensor view_weight_2d(const Tensor& weight_, +inline Tensor view_weight_2d(const Tensor& weight_, at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) { Tensor weight = weight_.contiguous(memory_format); if (weight.dim() == 4) { @@ -220,7 +220,7 @@ static inline Tensor view_weight_2d(const Tensor& weight_, } template -static void slow_conv2d_update_output_frame( +void slow_conv2d_update_output_frame( TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -480,7 +480,7 @@ void slow_conv2d_backward_weight_frame( } } -static void slow_conv2d_backward_weight_out_cpu_template( +void slow_conv2d_backward_weight_out_cpu_template( Tensor& grad_weight, const Tensor& input, const Tensor& grad_output_, diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index f361b3a81912..894bf29456f7 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -27,7 +28,7 @@ namespace at::native { namespace { -static Tensor compute_columns3d( +Tensor compute_columns3d( const Tensor& input_, IntArrayRef stride, IntArrayRef padding, @@ -107,7 +108,7 @@ static Tensor compute_columns3d( return columns; } -static inline void slow_conv3d_shape_check( +inline void slow_conv3d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -174,6 +175,23 @@ static inline void slow_conv3d_shape_check( const int64_t input_height = input.size(dim_height); const int64_t input_width = input.size(dim_width); + constexpr int64_t MAX_SAFE_PAD = (1LL << 61); + + TORCH_CHECK_VALUE( + 
pad_height <= MAX_SAFE_PAD, + "Padding height too large: pad_height=", + pad_height); + + TORCH_CHECK_VALUE( + pad_width <= MAX_SAFE_PAD, + "Padding width too large: pad_width=", + pad_width); + + TORCH_CHECK_VALUE( + pad_depth <= MAX_SAFE_PAD, + "Padding depth too large: pad_depth=", + pad_depth); + const int64_t exact_input_depth = input_depth + 2 * pad_depth; const int64_t exact_input_height = input_height + 2 * pad_height; const int64_t exact_input_width = input_width + 2 * pad_width; @@ -221,6 +239,14 @@ static inline void slow_conv3d_shape_check( output_width, "). Output size is too small"); + uint64_t kernel_product; + TORCH_CHECK( + !c10::mul_overflows(kernel_height, kernel_width, &kernel_product), + "Kernel height x width product is too large: kernel_height=", + kernel_height, + ", kernel_width=", + kernel_width); + if (weight.defined()) { int64_t n_input_plane = weight.size(1); if (weight.dim() == 2) { @@ -247,7 +273,7 @@ static inline void slow_conv3d_shape_check( } } -static Tensor view_weight_2d(const Tensor& weight_) { +Tensor view_weight_2d(const Tensor& weight_) { Tensor weight = weight_.contiguous(); if (weight.dim() == 5) { const int64_t s1 = weight.size(0); @@ -260,7 +286,7 @@ static Tensor view_weight_2d(const Tensor& weight_) { } template -static void slow_conv3d_update_output_frame( +void slow_conv3d_update_output_frame( TensorAccessor input, TensorAccessor output, TensorAccessor weight, @@ -489,7 +515,7 @@ void slow_conv3d_backward_weight_frame( grad_weight.data(), ldc, grad_weight.stride(0) * n); } -static void slow_conv3d_backward_parameters_out_cpu_template( +void slow_conv3d_backward_parameters_out_cpu_template( Tensor& grad_weight, const Tensor& input, const Tensor& grad_output, diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 3d388194ea49..0b3ffda30577 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -1,6 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include #include #include diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index 1c9db44aebb0..755fe00b1f1c 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -127,7 +128,7 @@ C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { - const static scalar_t kTailValues[] = { + constexpr static scalar_t kTailValues[] = { 0.0810614667953272, 0.0413406959554092, 0.0276779256849983, @@ -139,7 +140,7 @@ C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { 0.00925546218271273, 0.00833056343336287 }; - if (k <= 9) { + if (k < std::size(kTailValues)) { return kTailValues[static_cast(k)]; } scalar_t kp1sq = (k + 1) * (k + 1); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 150970edc507..e1076d0400f7 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -108,7 +108,7 @@ bool is_fast_path(const Tensor& src, const std::optional& scale, Tensor& // index_add (using add_indices as the index), without creating an intermediary // tensor to hold the selected embeddings template -static std::enable_if_t, void> +std::enable_if_t, void> index_select_add( const Tensor& select_indices, const Tensor& add_indices, @@ -494,7 +494,7 @@ index_select_add(const Tensor &select_indices, // mul (scaling by per_sample_weights) // index_add (using add_indices as the 
index) template -static std::enable_if_t, void> +std::enable_if_t, void> index_select_scale_add( const Tensor& select_indices, const Tensor& add_indices, diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 5ff1e6b61ed2..8e04a7490e87 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t nDims = self.dim(); TORCH_CHECK(nDims >= 2, "dimensions must larger than 1"); - int64_t height = self.size(0); - int64_t width = self.size(1); + auto height = self.sym_size(0); + auto width = self.sym_size(1); if (nDims > 2) { - int64_t dim1 = height; for (const auto i : c10::irange(1, nDims)) { - if (self.size(i) != dim1) { + if (self.sym_size(i) != height) { TORCH_CHECK(false, "all dimensions of input must be of equal length"); } } } - int64_t storage_offset = self.storage_offset(); - std::vector sizes; - std::vector strides; - int64_t size = std::min(height, width); + auto storage_offset = self.sym_storage_offset(); + auto size = std::min(height, width); int64_t stride = 0; for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } - strides.push_back(stride); - sizes.push_back(size); + std::vector strides{stride}; + std::vector sizes{size}; - auto main_diag = self.as_strided(sizes, strides, storage_offset); + auto main_diag = self.as_strided_symint(sizes, strides, storage_offset); main_diag.fill_(fill_value); if (wrap && nDims == 2 && height > width + 1) { - std::vector wrap_sizes; + auto step = width + 1; + auto wrap_size = ((self.numel() + step - 1) / step) - size; + std::vector wrap_sizes{wrap_size}; - int64_t step = width + 1; - int64_t wrap_size = ((self.numel() + step - 1) / step) - size; - wrap_sizes.push_back(wrap_size); + auto offset = self.stride(0) * (width + 1); - int64_t offset = self.stride(0) * (width + 1); - - auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset); + auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset); wrap_diag.fill_(fill_value); } diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index 059d27b39546..664a612d0b13 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -130,7 +130,7 @@ namespace native { namespace { template -static void fractional_max_pool2d_out_single_batch_frame( +void fractional_max_pool2d_out_single_batch_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -188,7 +188,7 @@ static void fractional_max_pool2d_out_single_batch_frame( } template -static void fractional_max_pool2d_out_frame( +void fractional_max_pool2d_out_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -220,7 +220,7 @@ static void fractional_max_pool2d_out_frame( } template -static void fractional_max_pool2d_backward_out_single_batch_frame( +void fractional_max_pool2d_backward_out_single_batch_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -247,7 +247,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( } template -static void fractional_max_pool2d_backward_out_frame( +void fractional_max_pool2d_backward_out_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 68328018b24b..5ed3fdeab765 100644 --- 
a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -99,7 +99,7 @@ namespace at::native { namespace { template -static void fractional_max_pool3d_out_single_batch_frame( +void fractional_max_pool3d_out_single_batch_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -169,7 +169,7 @@ static void fractional_max_pool3d_out_single_batch_frame( } template -static void fractional_max_pool3d_out_frame( +void fractional_max_pool3d_out_frame( const scalar_t* input, scalar_t* output, int64_t* indices, @@ -257,7 +257,7 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( namespace { template -static void fractional_max_pool3d_backward_out_single_batch_frame( +void fractional_max_pool3d_backward_out_single_batch_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, @@ -287,7 +287,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( } template -static void fractional_max_pool3d_backward_out_frame( +void fractional_max_pool3d_backward_out_frame( scalar_t* gradInput, const scalar_t* gradOutput, const int64_t* indices, diff --git a/aten/src/ATen/native/GridSamplerUtils.h b/aten/src/ATen/native/GridSamplerUtils.h index f783043c7961..3388af7b8a0a 100644 --- a/aten/src/ATen/native/GridSamplerUtils.h +++ b/aten/src/ATen/native/GridSamplerUtils.h @@ -93,6 +93,12 @@ inline bool cond_cudnn_grid_sampler( const TensorBase& input, const TensorBase& grid ) { + auto st = input.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; + st = grid.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; return ( at::native::cudnn_is_acceptable(input) && at::native::cudnn_is_acceptable(grid) && diff --git a/aten/src/ATen/native/Histogram.cpp b/aten/src/ATen/native/Histogram.cpp index 9954edef9460..5919997cf5fe 100644 --- a/aten/src/ATen/native/Histogram.cpp +++ b/aten/src/ATen/native/Histogram.cpp @@ -23,6 +23,7 @@ #include #endif +#include #include #include #include @@ -202,6 +203,46 @@ select_outer_bin_edges(const Tensor& input, std::optional> return std::make_pair(leftmost_edges, rightmost_edges); } + +/* Bin edges correction based on the precision representation. + * To maintain the backward compatibility we take max(std::nextafter<>, +1) + * and min(std::nextafter<>, -1) for scalar types. For other types +/- 1 as usual. + */ +void bins_edges_correction(const ScalarType& t, double &leftmost_edge, double &rightmost_edge) +{ +#define UPDATE_WITH_LIMIT(real_type, scalartype) \ + case ScalarType::scalartype: \ + leftmost_edge = std::min( \ + static_cast( \ + std::nexttoward( \ + static_cast(leftmost_edge), \ + std::numeric_limits::lowest() \ + ) \ + ), \ + leftmost_edge - 1. \ + ); \ + rightmost_edge = std::max( \ + static_cast( \ + std::nexttoward( \ + static_cast(rightmost_edge), \ + std::numeric_limits::max() \ + ) \ + ), \ + rightmost_edge + 1. \ + ); \ + break; + + switch (t) { + UPDATE_WITH_LIMIT(double, Double) + UPDATE_WITH_LIMIT(float, Float) + default: + // Fallback to the default behavior for other types + leftmost_edge -= 1; + rightmost_edge += 1; + } +#undef UPDATE_WITH_LIMIT +} + /* histc's version of the logic for outermost bin edges. 
*/ std::pair histc_select_outer_bin_edges(const Tensor& input, @@ -216,8 +257,7 @@ std::pair histc_select_outer_bin_edges(const Tensor& input, } if (leftmost_edge == rightmost_edge) { - leftmost_edge -= 1; - rightmost_edge += 1; + bins_edges_correction(input.dtype().toScalarType(), leftmost_edge, rightmost_edge); } TORCH_CHECK(!(std::isinf(leftmost_edge) || std::isinf(rightmost_edge) || diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index 25eb4d678724..acdcb2b27bda 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -19,7 +19,7 @@ namespace at::native { namespace { -static void im2col_out_cpu_template( +void im2col_out_cpu_template( Tensor& output, const Tensor& input_, IntArrayRef kernel_size, diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 616e6ec60e13..7b5ec83e1698 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -2801,6 +2801,7 @@ Tensor matrix_exp(const Tensor& a) { // TODO This should be deprecated in favor of linalg_matrix_exp_differential // in FunctionsManual.cpp Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + squareCheckInputs(self, "matrix_exp_backward"); NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 265bc112adcc..40d79d97c0cd 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -61,7 +61,7 @@ constexpr float EPSILON = 1e-12; namespace { - static inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { + inline at::Tensor apply_loss_reduction(const at::Tensor& unreduced, int64_t reduction) { if (reduction == at::Reduction::Mean) { return unreduced.mean(); } else if (reduction == at::Reduction::Sum) { diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index 46b9397a008c..2e2bc5542b51 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -44,7 +44,7 @@ namespace { // this ad-hoc converts from targets (l in [1]) to augmented targets (l' in [1]) note that no bound-checking is done template -static inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { +inline int64_t get_target_prime(target_t* target, int64_t offset, int64_t stride, int64_t idx, int64_t BLANK) { if (idx % 2 == 0) { return BLANK; } else { diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index a3ec774a0a46..b524d277cd0a 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -58,7 +58,7 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( } template -static void multilabel_margin_loss_forward_out_frame( +void multilabel_margin_loss_forward_out_frame( const Tensor& input_contiguous, const Tensor& target_contiguous, Tensor& output, @@ -108,7 +108,7 @@ static void multilabel_margin_loss_forward_out_frame( } } -static void multilabel_margin_loss_forward_out_cpu_template( +void multilabel_margin_loss_forward_out_cpu_template( const Tensor& input, const Tensor& target, Tensor& output, @@ -153,7 +153,7 @@ static void multilabel_margin_loss_forward_out_cpu_template( } template -static void multilabel_margin_loss_backward_out_frame( +void multilabel_margin_loss_backward_out_frame( Tensor& grad_input, const Tensor& 
grad_output, const Tensor& input_contiguous, @@ -222,7 +222,7 @@ static void multilabel_margin_loss_backward_out_frame( } } -static void multilabel_margin_loss_backward_out_cpu_template( +void multilabel_margin_loss_backward_out_cpu_template( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index f003cfcf2c5a..f9dc074a6983 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -57,7 +57,7 @@ inline int64_t target_index_checked( } template -static inline void multi_margin_loss_cpu_kernel( +inline void multi_margin_loss_cpu_kernel( Tensor& output, const scalar_t* input_data, const int64_t* target_data, @@ -148,7 +148,7 @@ void multi_margin_loss_out_cpu_template( } template -static void multi_margin_loss_backward_cpu_kernel( +void multi_margin_loss_backward_cpu_kernel( scalar_t* grad_input_data, const Tensor& grad_output, const scalar_t* input_data, diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index ca86292403fb..576f56986988 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -159,7 +159,7 @@ inline scalar_t* optional_data(const Tensor& source) { } template -static void nll_loss_out_frame( +void nll_loss_out_frame( const Tensor& output, const Tensor& total_weight, const Tensor& input, @@ -338,7 +338,7 @@ void nll_loss_forward_out_cpu_template( } template -static void nll_loss_backward_out_frame( +void nll_loss_backward_out_frame( const Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 4ce394ec2f56..7bea90cbd527 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -99,7 +99,7 @@ inline void check_gradout_shape_nll_loss2d( template -static void nll_loss2d_forward_out_frame( +void nll_loss2d_forward_out_frame( Tensor& output, Tensor& total_weight, const Tensor& input, @@ -280,7 +280,7 @@ void nll_loss2d_forward_out_cpu_template( } template -static void nll_loss2d_backward_out_frame( +void nll_loss2d_backward_out_frame( Tensor& grad_input, const Tensor& grad_output, const Tensor& input, diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index b261da5fe54e..4677542706f6 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -581,7 +581,7 @@ scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, template static scalar_t lanczos_sum_expg_scaled(scalar_t x) { // lanczos approximation - static const scalar_t lanczos_sum_expg_scaled_num[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { 0.006061842346248906525783753964555936883222, 0.5098416655656676188125178644804694509993, 19.51992788247617482847860966235652136208, @@ -596,7 +596,7 @@ static scalar_t lanczos_sum_expg_scaled(scalar_t x) { 103794043.1163445451906271053616070238554, 56906521.91347156388090791033559122686859 }; - static const scalar_t lanczos_sum_expg_scaled_denom[13] = { + static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { 1., 66., 1925., @@ -712,7 +712,7 @@ static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { template static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] - static const scalar_t d[25][25] = + static constexpr scalar_t d[25][25] = {{-3.3333333333333333e-1, 
8.3333333333333333e-2, -1.4814814814814815e-2, 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, diff --git a/aten/src/ATen/native/MaxUnpooling.cpp b/aten/src/ATen/native/MaxUnpooling.cpp index a71db5e8ef8d..f91b892efec2 100644 --- a/aten/src/ATen/native/MaxUnpooling.cpp +++ b/aten/src/ATen/native/MaxUnpooling.cpp @@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu( // Nondeterministic with duplicate indices at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out"); - auto oheight = output_size[0]; - auto owidth = output_size[1]; TORCH_CHECK( indices_.scalar_type() == at::ScalarType::Long, "elements in indices should be type int64 but got: ", indices_.scalar_type()); @@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu( self_.sizes(), " with dimension ", i , " being empty."); } + auto oheight = output_size[0]; + auto owidth = output_size[1]; + auto memory_format = self_.suggest_memory_format(); auto self = self_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index 799b5ffa2cdb..08c42a0d470c 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -24,7 +24,7 @@ namespace at { namespace { -static inline void slow_conv_transpose2d_shape_check( +inline void slow_conv_transpose2d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, @@ -386,7 +386,7 @@ void slow_conv_transpose2d_out_cpu_template( } } -static void slow_conv_transpose2d_backward_out_cpu_template( +void slow_conv_transpose2d_backward_out_cpu_template( const Tensor& input_, const Tensor& grad_output_, Tensor& grad_input, diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index f69e84521e5d..469269ab07df 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -22,7 +22,7 @@ namespace at::native { namespace { -static inline void slow_conv_transpose3d_shape_check( +inline void slow_conv_transpose3d_shape_check( const Tensor& input, const Tensor& grad_output, const Tensor& weight, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 13b421d1e688..72526162d133 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -62,7 +62,7 @@ #include #include -static const int MIOPEN_DIM_MAX = 5; +static constexpr int MIOPEN_DIM_MAX = 5; namespace at::meta { @@ -92,7 +92,7 @@ namespace { arg_name, " should contain ", expected, " elements not ", actual); } - static inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { + inline Tensor repeat_if_defined(const Tensor& t, const SymInt& repeat) { if (t.defined()) { return t.repeat_symint(repeat); } diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 8833bdb6e471..2a20f95f10c2 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -34,16 +34,16 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } - auto shape = self.sizes().vec(); + auto shape = self.sym_sizes().vec(); // empty tensor could be converted to one hot representation, // but shape inference is not possible. 
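// [Annotation, not part of the patch] The Math.h and Normalization.cpp hunks above change
// in-function lookup tables and MIOPEN_DIM_MAX from `static const` to `static constexpr`.
// Minimal sketch of the distinction, assuming the initializers are literals (as they are here);
// `eval_poly_sketch` is a hypothetical example, not code from this patch:
#include <cstddef>

double eval_poly_sketch(double x) {
  // `static const` still allows dynamic initialization (and, in general, a thread-safe
  // init guard); `static constexpr` guarantees compile-time initialization and turns a
  // non-constant initializer into a hard error.
  static constexpr double coeffs[3] = {1.0, -0.5, 0.25};
  double acc = 0.0;
  for (std::size_t i = 0; i < 3; ++i) {
    acc = acc * x + coeffs[i];
  }
  return acc;
}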
- if (self.numel() == 0) { + if (self.sym_numel() == 0) { if (num_classes <= 0) { TORCH_CHECK(false, "Can not infer total number of classes from empty tensor."); } else { - shape.push_back(num_classes); - return at::empty(shape, self.options()); + shape.emplace_back(num_classes); + return at::empty_symint(shape, self.options()); } } @@ -66,8 +66,8 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } } - shape.push_back(num_classes); - Tensor ret = at::zeros(shape, self.options()); + shape.emplace_back(num_classes); + Tensor ret = at::zeros_symint(shape, self.options()); ret.scatter_(-1, self.unsqueeze(-1), 1); return ret; } diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8099648d37b2..986447bab614 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -70,10 +70,10 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } - for (const auto i : c10::irange((size_t)l_pad)) { + for (const auto i : c10::irange(l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; - TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", + TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ", pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, " "which is invalid. Check dimension ", l_diff + i, " of your input."); new_shape.emplace_back(new_dim); diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index f3858ac3d365..75b30320b027 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -108,6 +108,13 @@ bool use_mkldnn(const Tensor& input, TensorList params, TensorList hx) { return false; } +bool use_cudnn(const Tensor& t) { + bool acceptable = at::cudnn_is_acceptable(t); + auto st = t.scalar_type(); + bool bfloat16_cond = st == kBFloat16 && at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN(); + return acceptable && (bfloat16_cond || st == kDouble || st == kFloat || st == kHalf); +} + template using pair_of = std::pair; @@ -531,7 +538,7 @@ c10::intrusive_ptr make_quantized_cell_params_fp16( std::move(w_ih_packed), std::move(w_hh_packed)); } -static std::unordered_map< +std::unordered_map< std::string, c10::intrusive_ptr (*)(CellParamsSerializationType)> cell_params_deserializers = { @@ -571,7 +578,7 @@ struct QRNNCellParamsWrapper { // Gathers every two elements of a vector in a vector of pairs template -static std::vector> pair_vec(const std::vector& vals) { +std::vector> pair_vec(const std::vector& vals) { TORCH_CHECK(vals.size() % 2 == 0, "Odd number of params or hiddens given to a bidirectional RNN"); std::vector> result; result.reserve(vals.size() / 2); @@ -583,7 +590,7 @@ static std::vector> pair_vec(const std::vector& vals) { // Flattens a vector of pairs template -static std::vector unpair_vec(std::vector>&& vals) { +std::vector unpair_vec(std::vector>&& vals) { std::vector result; result.reserve(vals.size() * 2); for (const auto i : c10::irange(vals.size())) { @@ -594,7 +601,7 @@ static std::vector unpair_vec(std::vector>&& vals) { } // Parses a flat list of parameter tensors into a list of CellParams -static std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { +std::vector gather_params(TensorList params, bool has_biases, bool has_projections = false) { static at::Tensor undefined; 
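// [Annotation, not part of the patch] The Onehot.cpp hunk above rebuilds the output shape
// from SymInts so one_hot stays usable under symbolic/dynamic shapes. Condensed sketch of
// the same pattern with the ATen calls used in the hunk (error handling omitted;
// `one_hot_sketch` is a hypothetical name):
#include <ATen/ATen.h>
#include <cstdint>

at::Tensor one_hot_sketch(const at::Tensor& self, int64_t num_classes) {
  auto shape = self.sym_sizes().vec();      // std::vector<c10::SymInt>
  shape.emplace_back(num_classes);          // SymInt is constructible from int64_t
  at::Tensor ret = at::zeros_symint(shape, self.options());
  ret.scatter_(-1, self.unsqueeze(-1), 1);  // same scatter_ as the real operator
  return ret;
}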
std::vector result; if (has_biases) { @@ -1200,7 +1207,7 @@ std::tuple _thnn_fused_lstm_cell_backwar bool train, \ bool bidirectional, \ bool batch_first) { \ - if (at::cudnn_is_acceptable(_input)) { \ + if (use_cudnn(_input)) { \ Tensor output, hy; \ NAME##_cudnn_stub( \ _input.device().type(), \ @@ -1262,7 +1269,7 @@ std::tuple _thnn_fused_lstm_cell_backwar double dropout_p, \ bool train, \ bool bidirectional) { \ - if (at::cudnn_is_acceptable(data)) { \ + if (use_cudnn(data)) { \ Tensor output, hy; \ NAME##_packed_cudnn_stub( \ data.device().type(), \ @@ -1430,7 +1437,7 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); - if (at::cudnn_is_acceptable(_input)) { + if (use_cudnn(_input)) { Tensor output, hy, cy; lstm_cudnn_stub(_input.device().type(), output, hy, cy, _input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first); @@ -1491,7 +1498,7 @@ std::tuple lstm( TensorList _params, bool has_biases, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { TORCH_CHECK(hx.size() == 2, "lstm expects two hidden states"); - if (at::cudnn_is_acceptable(data)) { + if (use_cudnn(data)) { Tensor output, hy, cy; lstm_packed_cudnn_stub(data.device().type(), output, hy, cy, data, batch_sizes, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional); @@ -1887,10 +1894,10 @@ static DEFINE_QUANTIZED_RNN_CELL_DYNAMIC(quantized_rnn_tanh_cell_dynamic, simple namespace { -[[maybe_unused]] static auto ensure_linear_params_registered = +[[maybe_unused]] auto ensure_linear_params_registered = register_linear_params(); -static auto cell_params_base_registry = +auto cell_params_base_registry = torch::selective_class_("rnn", TORCH_SELECTIVE_CLASS("CellParamsBase")) .def_pickle( [](const c10::intrusive_ptr& self) diff --git a/aten/src/ATen/native/RangeUtils.h b/aten/src/ATen/native/RangeUtils.h index dcab86ca9a42..fd62b8e01329 100644 --- a/aten/src/ATen/native/RangeUtils.h +++ b/aten/src/ATen/native/RangeUtils.h @@ -47,7 +47,7 @@ int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar int64_t sgn = (xstep > 0) - (xstep < 0); size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); } else { - size_d = std::ceil(static_cast(end.to() - start.to()) + size_d = std::ceil((end.to() - start.to()) / step.to()); } diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index daf153e460e9..a946def225b0 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -107,11 +107,6 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { storage->set_nbytes(size_bytes); } -// Call the sparse implementation in SparseTensor.cpp directly. -// A dynamic dispatch here is NOT necessary, so I didn't put -// this function in native_functions.yaml -const Tensor& resize_as_sparse_(const Tensor& self, const Tensor& src); - // TODO(VitalyFedyunin): Move it to HTML docs. 
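// [Annotation, not part of the patch] The use_cudnn() gate added in RNN.cpp above keeps the
// old fp32/fp64/fp16 routing and additionally sends bf16 inputs to cuDNN only when the
// backend hook reports support. Restated as a dtype-only predicate for illustration
// (`rnn_dtype_wants_cudnn` is hypothetical; the real helper also requires
// at::cudnn_is_acceptable(t) to pass, and `bf16_supported` stands in for
// at::detail::getCUDAHooks().supportsBFloat16RNNWithCuDNN()):
#include <c10/core/ScalarType.h>

bool rnn_dtype_wants_cudnn(c10::ScalarType st, bool bf16_supported) {
  return (st == c10::kBFloat16 && bf16_supported) ||
         st == c10::kDouble || st == c10::kFloat || st == c10::kHalf;
}
// rnn_dtype_wants_cudnn(c10::kFloat,    /*bf16_supported=*/false) -> true
// rnn_dtype_wants_cudnn(c10::kBFloat16, /*bf16_supported=*/false) -> false
// rnn_dtype_wants_cudnn(c10::kBFloat16, /*bf16_supported=*/true)  -> true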
// // Strides of the output tensor of `resize_as_` operator is defined by input diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 0053b86c3373..39e203f63278 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -15,7 +15,11 @@ namespace at::native { Scalar item(const Tensor& self) { auto numel = self.sym_numel(); - TORCH_CHECK(numel == 1, "a Tensor with ", numel, " elements cannot be converted to Scalar"); + TORCH_SYM_CHECK( + numel.sym_eq(1), + "a Tensor with ", + numel, + " elements cannot be converted to Scalar"); if (self.is_sparse()) { if (self._nnz() == 0) return Scalar(0); if (self.is_coalesced()) return at::_local_scalar_dense(self._values()); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 1de72abd5886..15794040bf39 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -346,17 +346,17 @@ template struct AbsSwitch {}; template -inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch /*unused*/) { return static_cast(data); } template -inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch /*unused*/) { return static_cast(std::abs(data)); } template -inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch) { +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch /*unused*/) { return static_cast(std::abs(at::opmath_type>(data))); } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 7d613fc02312..451869f521df 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -145,12 +145,6 @@ #include #include -namespace at::native { - -AdvancedIndex make_info(Tensor self, IOptTensorListRef orig); - -} // namespace at::native - namespace at::meta { TORCH_META_FUNC(gather) @@ -1912,11 +1906,9 @@ Tensor& index_fill_( "This also applies to advanced indexing e.g. tensor[mask] = scalar"); } - if (!self.is_complex() && source.isComplex()) { - TORCH_CHECK( - false, - "index_fill_(): Converting complex Scalar to non-complex type is not supported"); - } + TORCH_CHECK( + self.is_complex() || !source.isComplex(), + "index_fill_(): Converting complex Scalar to non-complex type is not supported"); // Handle the case when `self` is 0-dim Tensor self_nonzero_dim = (self.dim() == 0) ? 
self.unsqueeze(-1) : self; @@ -2682,7 +2674,7 @@ inline std::tuple _take_along_dim_helper( std::move(dim)); } -static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { +inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { TORCH_CHECK( !t.defined() || t.device() == device, "Expected tensor to have ", @@ -2695,7 +2687,7 @@ static inline void checkDevice(CheckedFrom c, const Tensor& t, Device device) { ")"); } -static inline void checkDevice( +inline void checkDevice( CheckedFrom c, at::ArrayRef tensors, Device device) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h index bc6c2533eac5..6f127b711d3e 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexingUtils.h +++ b/aten/src/ATen/native/TensorAdvancedIndexingUtils.h @@ -77,7 +77,7 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { // next broadcast all index tensors together try { indices = expand_outplace(indices); - } catch (std::exception& e) { + } catch (std::exception&) { TORCH_CHECK_INDEX( false, "shape mismatch: indexing tensors could not be broadcast together" diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index d9a42da482c0..c6126eda61e7 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -73,7 +73,6 @@ #include #include -#include #include #endif @@ -847,7 +846,7 @@ TORCH_IMPL_FUNC(clamp_Tensor_out) (const Tensor& self, const OptionalTensorRef min, const OptionalTensorRef max, - const Tensor&) { + const Tensor& /*unused*/) { if (min && max) { clamp_stub(device_type(), *this); } else if (min) { diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 4fa0556ad785..c15b082f107b 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -91,9 +91,6 @@ bool cudnn_is_acceptable(const TensorBase& self) { return false; if (!self.is_cuda()) return false; - auto st = self.scalar_type(); - if (!(st == kDouble || st == kFloat || st == kHalf)) - return false; if (!detail::getCUDAHooks().compiledWithCuDNN()) return false; // cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index c2d0856c3cd4..6df7761d822d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1,3 +1,5 @@ +#include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -1878,19 +1880,18 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { Tensor xtensor = self.expand(padded_size); - Tensor result; + Tensor urtensor; if (self.is_quantized()) { - result = at::empty_quantized(target_size, self); + urtensor = at::empty_quantized(target_size, self); } else { - result = at::empty(target_size, self.options()); + urtensor = at::empty(target_size, self.options()); } // return an empty tensor if one of the repeat dimensions is zero if (zero_tensor) { - return result; + return urtensor; } - Tensor urtensor = at::alias(result); for (const auto i : c10::irange(xtensor.dim())) { // can't unfold with step 0, so make sure step is at least 1 // (it doesn't matter what it is in that case, because the size is 0). 
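// [Annotation, not part of the patch] The index_fill_ hunk above folds
// `if (!self.is_complex() && source.isComplex()) TORCH_CHECK(false, ...)` into a single
// positive-form TORCH_CHECK. The two are equivalent by De Morgan: the old code failed
// exactly when `!self.is_complex() && source.isComplex()`, i.e. it succeeded when
// `self.is_complex() || !source.isComplex()`, which is the new condition. Generic
// illustration (`check_fill_dtype` is a hypothetical helper):
#include <c10/util/Exception.h>

void check_fill_dtype(bool self_is_complex, bool source_is_complex) {
  TORCH_CHECK(self_is_complex || !source_is_complex,
              "index_fill_(): Converting complex Scalar to non-complex type is not supported");
}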
@@ -1900,7 +1901,22 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) { urtensor.copy_(xtensor.expand_as(urtensor)); - return result; + // Combine the dimensions to produce the target_size. + // xtensor dims: [a0, ..., ad-1] + // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1] + // b dims are produced by unfold. + // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1] + const int64_t n_dims = xtensor.dim(); + auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong)); + auto range_b = range_a + n_dims; + auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten(); + auto permutation = IntArrayRef(stacked.data_ptr(), n_dims * 2); + // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1] + urtensor = urtensor.permute(permutation); + // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1] + urtensor = urtensor.reshape(target_size); + + return urtensor; } Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) { @@ -2051,7 +2067,7 @@ Tensor _reshape_copy_symint( TORCH_CHECK(0, "_reshape_copy not implemented for mkldnn tensors"); } - if (self.is_contiguous()) { + if (self.is_contiguous_or_false()) { return self.view_symint(shape).clone(at::MemoryFormat::Contiguous); } else { return at::_unsafe_view_symint( @@ -3625,7 +3641,7 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) { namespace { // Transpose implementation for sparse compressed layouts // NB: We assume that dim1,dim0 have already been wrapped -static inline Tensor sparse_compressed_transpose( +inline Tensor sparse_compressed_transpose( const Tensor& self, int64_t dim0, int64_t dim1) { diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 3030cb54aea6..156d2c8974b8 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -29,7 +29,7 @@ namespace { // grad_in does not mean that it is a gradient wrt to input, // grad_in/grad_out is just an input/output of unfold_backward kernel. 
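// [Annotation, not part of the patch] Worked example of the dim-combining step the repeat()
// hunk above adds. Suppose xtensor has sizes [2, 3] and target_size is [8, 15] (repeats
// [4, 5]). After the unfold loop, urtensor has sizes [2, 3, 4, 5] (a-dims then b-dims).
// stack({range_a, range_b}, 1).flatten() yields the permutation [0, 2, 1, 3], which
// interleaves the pairs to sizes [2, 4, 3, 5]; the final reshape then merges each
// (a_i, b_i) pair into [8, 15]. The same permutation can be computed without tensors
// (`interleave_permutation` is a hypothetical helper, shown only to make the index
// arithmetic explicit):
#include <cstdint>
#include <vector>

std::vector<int64_t> interleave_permutation(int64_t n_dims) {
  std::vector<int64_t> perm;
  perm.reserve(2 * n_dims);
  for (int64_t i = 0; i < n_dims; ++i) {
    perm.push_back(i);           // a_i, one of the original (expanded) dims
    perm.push_back(i + n_dims);  // b_i, the matching dim produced by unfold
  }
  return perm;  // n_dims == 2  ->  {0, 2, 1, 3}
}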
-[[maybe_unused]] static TensorIterator _make_unfold_backward_iter_over_grad_out( +[[maybe_unused]] TensorIterator _make_unfold_backward_iter_over_grad_out( Tensor& grad_out, const Tensor& grad_in, int64_t dim, diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index d9e1bf30c6a6..b14079e7ea19 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -124,7 +124,7 @@ struct IsUnique {}; template struct IsUnique { - inline bool operator() (scalar_t* data_ptr, int64_t i) { + bool operator() (scalar_t* data_ptr, int64_t i) { if (i == 0) { return true; } return c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1]); } @@ -132,7 +132,7 @@ struct IsUnique { template struct IsUnique { - inline bool operator() (scalar_t* data_ptr, int64_t i) { + bool operator() (scalar_t* data_ptr, int64_t i) { if (i == 0) { return true; } return (c10::load(&data_ptr[i]) != c10::load(&data_ptr[i - 1])) && !(_isnan(data_ptr[i]) && _isnan(data_ptr[i - 1])); diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 5b49fdd02954..cf6727c2207c 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -407,7 +406,7 @@ scalar_t cubic_convolution2(scalar_t x, scalar_t A) { } template -void get_cubic_upsample_coefficients( +static inline void get_cubic_upsample_coefficients( scalar_t coeffs[4], scalar_t t) { scalar_t A = -0.75; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index b02d809bb57a..3ab8795f6dca 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -105,7 +105,7 @@ namespace at::native { namespace { template -static void upsample_bicubic2d_backward_out_frame( +void upsample_bicubic2d_backward_out_frame( const scalar_t* odata, scalar_t* idata, int64_t input_height, @@ -177,7 +177,7 @@ static void upsample_bicubic2d_backward_out_frame( }); } -static void upsample_bicubic2d_backward_kernel( +void upsample_bicubic2d_backward_kernel( const Tensor& grad_input, const Tensor& grad_output_, IntArrayRef output_size, diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index 8c8ad45acc44..02c798a3d040 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -25,11 +25,11 @@ namespace at::native { void _backward(const Tensor& self, TensorList inputs, const std::optional& gradient_opt, std::optional keep_graph, bool create_graph) { - return self._backward(inputs, gradient_opt, keep_graph, create_graph); + self._backward(inputs, gradient_opt, keep_graph, create_graph); } void set_data(Tensor& self, const Tensor& new_data) { - return self.set_data(new_data); + self.set_data(new_data); } Tensor data(const Tensor& self) { @@ -54,7 +54,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { } void retain_grad(Tensor& self) { - return self.retain_grad(); + self.retain_grad(); } bool retains_grad(const Tensor& self) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp index f528dd14adb0..0773217c90a4 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.cpp @@ -39,6 +39,6 @@ int register_linear_params() { } namespace { -[[maybe_unused]] static auto linear_params = 
register_linear_params(); +[[maybe_unused]] auto linear_params = register_linear_params(); } // namespace } // namespace ao::sparse diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp index ab2da21d4b58..9bb8fbdb0e05 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp @@ -17,7 +17,7 @@ namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index 8c3a93289c10..968e58d591c1 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -20,7 +20,7 @@ namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM namespace { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp index bda1984d6207..b9cffe5b0bcb 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_unpack.cpp @@ -16,7 +16,7 @@ #endif namespace ao::sparse { -int register_linear_params(); + #ifdef USE_FBGEMM diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index 00c9f4eb2534..bc9b452bc687 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -30,7 +30,7 @@ namespace { // Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON __attribute__((optimize("no-tree-vectorize"))) #endif -static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { +void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { if (at::isReducedFloatingType(input.scalar_type())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { using Vec = Vectorized; @@ -96,7 +96,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const } } -static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { +void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "log_sigmoid_backward_cpu", [&]() { using Vec = Vectorized; @@ -150,7 +150,7 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { } } -static void threshold_kernel( +void threshold_kernel( TensorIteratorBase& iter, const Scalar& threshold_scalar, const Scalar& value_scalar) { @@ -868,7 +868,7 @@ void hardswish_backward_kernel(TensorIterator& iter) { } } -static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_cpu", [&]() { auto zero_vec = Vectorized((float)(0)); @@ -907,7 +907,7 @@ static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { } } -static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { +void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& negval_) { if (at::isReducedFloatingType(iter.dtype())) { AT_DISPATCH_REDUCED_FLOATING_TYPES(iter.dtype(), "leaky_relu_backward_cpu", 
[&]() { auto zero_vec = Vectorized((float)(0)); diff --git a/aten/src/ATen/native/cpu/AtomicAddFloat.h b/aten/src/ATen/native/cpu/AtomicAddFloat.h index 5b24ee4821c4..526f86d705b7 100644 --- a/aten/src/ATen/native/cpu/AtomicAddFloat.h +++ b/aten/src/ATen/native/cpu/AtomicAddFloat.h @@ -22,7 +22,7 @@ static inline void cpu_atomic_add_float(float* dst, float fvalue) old_value.floatV = *dst; new_value.floatV = old_value.floatV + fvalue; - unsigned* old_intV = (unsigned*)(&old_value.intV); + unsigned* old_intV = &old_value.intV; while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { #ifdef __aarch64__ __asm__ __volatile__("yield;" : : : "memory"); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 3db9646b31c4..10e0daacab33 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -300,7 +300,8 @@ void div_floor_kernel(TensorIteratorBase& iter) { // In the special case of unsigned integer division, floor division is // equivalent to truncation division (since the signs of the divisor and // dividend are always the same) - return div_trunc_kernel(iter); + div_trunc_kernel(iter); + return; } else if (isIntegralType(dtype, /*includeBool*/ false)) { // There's no SIMD integer division, so don't try to vectorize it. AT_DISPATCH_INTEGRAL_TYPES(dtype, "div_floor_cpu", [&]() { diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index ab3b16c395a3..2e3a82ac049e 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -118,7 +118,7 @@ gemm_notrans_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b) - const uint64_t unsigned_m = static_cast(m); + const uint64_t unsigned_m = m; const uint64_t i_m = unsigned_m / 4; for (const uint64_t l : c10::irange(k)) { for (const uint64_t j : c10::irange(n)) { @@ -369,7 +369,7 @@ void gemm_notrans_( #endif // defined(__aarch64__) && !defined(C10_MOBILE) #if !defined(C10_MOBILE) -static float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { +float compute_dot(const at::Half* a, const at::Half* b, int64_t len) { return at::native::CPU_CAPABILITY::fp16_dot_with_fp32_arith( a, b, len); } @@ -406,7 +406,7 @@ void gemm_transa_( }); } -static float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { +float compute_dot(const at::BFloat16* a, const at::BFloat16* b, int64_t len) { return at::native::CPU_CAPABILITY::bf16_dot_with_fp32_arith(a, b, len); } diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index 78651bca746d..365a79ba52ca 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -15,12 +15,12 @@ namespace at::native { inline namespace CPU_CAPABILITY { namespace { -static bool reduced_input(ScalarType input_t, ScalarType output_t) { +bool reduced_input(ScalarType input_t, ScalarType output_t) { return !at::isFloat8Type(input_t) && at::isReducedFloatingType(input_t) && output_t == kFloat; } -static bool reduced_output(ScalarType input_t, ScalarType output_t) { +bool reduced_output(ScalarType input_t, ScalarType output_t) { return !at::isFloat8Type(output_t) && at::isReducedFloatingType(output_t) && input_t == kFloat; } diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index b380ef619b40..66e49f911f68 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ 
b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -15,7 +15,7 @@ namespace at::native { namespace { template -static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { int64_t total = a.numel() / 3; int64_t a_stride = a.stride(dim); int64_t b_stride = b.stride(dim); @@ -68,7 +68,7 @@ static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, }); } -static void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { +void cross_kernel_impl(const Tensor& result, const Tensor& a, const Tensor& b, const int64_t dim) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, result.scalar_type(), "cross", [&]() { apply_cross(result, a, b, dim); }); diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 6526a4308221..1f9a8ff1097d 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -452,11 +452,11 @@ void convolution_depthwise3x3_winograd_impl( #else void convolution_depthwise3x3_winograd_impl( - const Arguments&, - const float* const, - const float* const, - const float* const, - float* const) { + const Arguments& /*unused*/, + const float* const /*unused*/, + const float* const /*unused*/, + const float* const /*unused*/, + float* const /*unused*/) { } #endif /* __ARM_NEON__ */ diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index a1a7059b7d64..412d90d9e454 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -422,19 +422,19 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, const double }); } -static void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { +void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& self, const double p, const Tensor& dist) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_backward", [&] { Dist::apply_backward_pdist(result, grad, self, p, dist); }); } -static void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { +void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, const double p) { AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist", [&] { Dist::apply_cdist(result, x1, x2, p); }); } -static void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { +void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor& x1, const Tensor& x2, const double p, const Tensor& dist) { AT_DISPATCH_FLOATING_TYPES(result.scalar_type(), "cdist_backward", [&] { Dist::apply_backward_cdist(result, grad, x1, x2, p, dist); }); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index a61e0364579b..e3fdefb52304 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -27,7 +27,7 @@ namespace at::native { namespace { -static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { CPUGeneratorImpl* generator = 
get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::cauchy_kernel(iter, median, sigma, generator); } @@ -101,7 +101,7 @@ void bernoulli_scalar_kernel(const TensorBase &self, double p, std::optional gen) { +void exponential_kernel_default(TensorIteratorBase& iter, double lambda, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::exponential_kernel(iter, lambda, generator); } @@ -198,12 +198,12 @@ void exponential_kernel(TensorIteratorBase &iter, double lambda, std::optional gen) { +void geometric_kernel(TensorIteratorBase& iter, double p, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::geometric_kernel(iter, p, generator); } -static void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::log_normal_kernel(iter, mean, std, generator); } @@ -218,12 +218,12 @@ void normal_kernel(const TensorBase &self, double mean, double std, std::optiona templates::cpu::normal_kernel(self, mean, std, generator); } -static void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_from_to_kernel(iter, range, base, generator); } -static void random_kernel(TensorIteratorBase& iter, std::optional gen) { +void random_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_kernel(iter, generator); } @@ -231,7 +231,7 @@ static void random_kernel(TensorIteratorBase& iter, std::optional gen // This is the special kernel to handle single specific case: // from(inclusive) = std::numeric_limits::lowest() // to(exclusive) = None (= std::numeric_limits::max() + 1) -static void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, std::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::random_full_64_bits_range_kernel(iter, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 8171ae8e79ad..1f8693902a32 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -85,7 +85,7 @@ struct RandomKernel { // ==================================================== Normal ======================================================== #ifdef CPU_CAPABILITY_AVX2 -static void normal_fill_16_AVX2(float *data, +void normal_fill_16_AVX2(float *data, const __m256* two_pi, const __m256* one, const __m256* minus_two, @@ -136,7 +136,7 @@ void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, #endif template -static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { +void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { for (const auto j : c10::irange(8)) { const 
scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. const scalar_t u2 = data[j + 8]; diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index 4432b9ace791..5ac497139607 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -158,14 +158,14 @@ inline void _mul_reduce_max_fusion_kernel( } template -static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { +inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { TORCH_CHECK(ptr2 == nullptr); return ptr; } template , int> = 0> -static inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { +inline scalar_t* conditional_data_ptr(float* ptr, scalar_t* ptr2) { return ptr2; } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 9450b7eca9b3..7587988528eb 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -441,7 +441,7 @@ struct ComputeLocation // See NOTE [ Grid Sample CPU Kernels ] for details. template -static inline void +inline void mask_scatter_add(const scalar_t *src, scalar_t* base_addr, const int_same_size_t *offsets, const int_same_size_t *mask, int64_t len) { @@ -1030,7 +1030,7 @@ struct ApplyGridSample -static inline void grid_sample_2d_grid_slice_iterator( +inline void grid_sample_2d_grid_slice_iterator( const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index 4a16d2bb7ba9..261683a187b8 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -259,7 +259,7 @@ void histogramdd_out_cpu_template(const Tensor& self, const std::optional& weight, bool density, +void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges) { histogramdd_out_cpu_template(self, weight, density, hist, bin_edges); } @@ -269,7 +269,7 @@ static void histogramdd_kernel_impl(const Tensor& self, const std::optional& weight, +void histogramdd_linear_kernel_impl(const Tensor& self, const std::optional& weight, bool density, Tensor& hist, const TensorList& bin_edges, bool local_search) { if (local_search) { // histogramdd codepath: both hist and bin_edges are eventually returned as output, @@ -298,7 +298,7 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, std::copy(max_data, max_data + N, rightmost_edges.begin()); } -static void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, +void histogram_select_outer_bin_edges_impl(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges) { AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "histogramdd", [&]() { infer_bin_edges_from_input(input, N, leftmost_edges, rightmost_edges); diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 1e6723b5f08b..57d3ab89c617 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -749,21 +749,29 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // }); if (iter_dtype == kByte) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kChar) { - return 
cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kInt) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kLong) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kShort) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kBool) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kFloat) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } else if (iter_dtype == kDouble) { - return cpu_hflip_vec(iter); + cpu_hflip_vec(iter); + return; } } // other dtypes (float16, bfloat16, complex) are handled by cpu_kernel_vec (see below) @@ -778,10 +786,12 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { - return cpu_hflip_channels_last_vec(iter); + cpu_hflip_channels_last_vec(iter); + return; } // Special case: vertical flip using memcpy (faster than generic cpu_kernel_vec) - return cpu_vflip_memcpy(iter); + cpu_vflip_memcpy(iter); + return; } AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(), "flip_cpu", diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 83b51a998563..aad618a258a3 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -46,7 +46,7 @@ using namespace vec; template typename traits::ArgsTuple dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, - std::index_sequence) { + std::index_sequence /*unused*/) { return std::make_tuple( c10::load::type>( data[INDEX] + i * strides[INDEX])...); @@ -65,7 +65,7 @@ dereference_vec_impl(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i, - std::index_sequence) { + std::index_sequence /*unused*/) { using Vec = typename traits::result_type; using scalar_t = typename Vec::value_type; return std::make_tuple( @@ -231,7 +231,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve template inline void unroll_contiguous_scalar_checks( const int64_t* /*strides*/, - std::index_sequence<>, + std::index_sequence<> /*unused*/, cb_t&& cb) { cb(0); } @@ -239,7 +239,7 @@ inline void unroll_contiguous_scalar_checks( template inline void unroll_contiguous_scalar_checks( const int64_t* strides, - std::index_sequence, + std::index_sequence /*unused*/, cb_t&& cb) { if (is_contiguous_scalar(strides)) { cb(INDEX0 + 1); diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index b75acf4ffc24..7ea8e87e28b1 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -210,7 +210,7 @@ multinomial_with_replacement_apply( } } -static void multinomial_with_replacement_kernel_impl( +void multinomial_with_replacement_kernel_impl( Tensor& result, const Tensor& self, const int64_t n_sample, diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index 59d838b9782d..853fc959f634 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -96,7 +96,7 @@ struct ReplicationPad { }; template -static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { +inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { using Vec = Vectorized; 
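// [Annotation, not part of the patch] Several hunks above (div_floor_kernel, the pow
// specializations, flip_kernel, the reduced-precision gemv paths) rewrite
// `return some_void_call(...);` as a plain call followed by `return;`. Both forms are
// valid C++ for void functions and behave identically; the split form appears to be
// what the lint pass driving this patch prefers (that motivation is an assumption).
// Minimal illustration with a hypothetical helper:
void log_message() {}

void before_style() { return log_message(); }   // legal: returns a void expression
void after_style()  { log_message(); return; }  // equivalent, two explicit statements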
int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { @@ -112,7 +112,7 @@ static inline void copy_stub(scalar_t* out, const scalar_t* in, int64_t size) { } template -static inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { +inline void add_stub(scalar_t* grad_in, const scalar_t* grad_out, int64_t size) { using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index a9d6db2c0382..6fad9270bf19 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -9,7 +9,7 @@ namespace at::native { namespace { -static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcmul_cpu_out", [&]() { @@ -50,7 +50,7 @@ static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } -static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { +void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { ScalarType dtype = iter.common_dtype(); if (at::isReducedFloatingType(dtype)) { AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcdiv_cpu_out", [&]() { @@ -90,7 +90,7 @@ static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) { } } -static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { +void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) { ScalarType dtype = iter.dtype(0); if (dtype == kBFloat16) { auto norm_val = norm.to(); @@ -176,7 +176,7 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no } } -static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { +void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "huber_backward_cpu_out", [&] { auto norm_val = norm.to(); @@ -215,7 +215,7 @@ static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, }); } -static void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { +void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "mse_backward_cpu_out", [&] { scalar_t scalar_val = value.to(); diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index 2cf751f05116..18e14ed5d30d 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -96,11 +96,14 @@ static void pow_tensor_scalar_kernel( dtype == kBFloat16 || isComplexType(dtype)) { // Dispatch to fast specialization for sqrt, rsqrt and reciprocal if (exp_scalar.equal(.5)) { - return sqrt_kernel(iter); + sqrt_kernel(iter); + return; } else if (exp_scalar.equal(-0.5)) { - return rsqrt_kernel(iter); + rsqrt_kernel(iter); + return; } else if (exp_scalar.equal(-1.0)) { - return reciprocal_kernel(iter); + reciprocal_kernel(iter); + return; } } diff --git a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp index ee9396136612..b469aa5c2eee 
100644 --- a/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp +++ b/aten/src/ATen/native/cpu/RangeFactoriesKernel.cpp @@ -18,7 +18,7 @@ namespace { using namespace vec; -static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { +void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_steps, const Scalar& scalar_step) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "arange_cpu", [&]() { using accscalar_t = at::acc_type; auto start = scalar_start.to(); @@ -42,7 +42,7 @@ static void arange_kernel(TensorIterator& iter, const Scalar& scalar_start, cons }); } -static void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { +void linspace_kernel(TensorIterator& iter, const Scalar& scalar_start, const Scalar& scalar_end, int64_t steps) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.dtype(), "linspace_cpu", [&]() { // step should be of double type for all integral types using step_t = std::conditional_t, double, scalar_t>; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index a53fe53a8457..c7eaa802af12 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -62,7 +62,7 @@ inline void reduce_all_impl( output.fill_(result); } -static void min_all_kernel_impl(Tensor& result, const Tensor& input) { +void min_all_kernel_impl(Tensor& result, const Tensor& input) { if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -87,7 +87,7 @@ static void min_all_kernel_impl(Tensor& result, const Tensor& input) { } } -static void max_all_kernel_impl(Tensor& result, const Tensor& input) { +void max_all_kernel_impl(Tensor& result, const Tensor& input) { if (input.scalar_type() == ScalarType::Bool) { TensorIterator iter = TensorIteratorConfig() .add_input(input) @@ -167,7 +167,7 @@ inline void reduce_all_impl_vec_two_outputs( output2.fill_(result.second); } -static void aminmax_allreduce_kernel( +void aminmax_allreduce_kernel( const Tensor& input, Tensor& min_result, Tensor& max_result) { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index c06731dfc718..2e6293650194 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -28,7 +28,7 @@ namespace at::native { namespace { using namespace vec; template -static inline void cpu_cum_base_kernel(const Tensor& result, +inline void cpu_cum_base_kernel(const Tensor& result, const Tensor& self, int64_t dim, const func_t& f, @@ -76,7 +76,7 @@ static inline void cpu_cum_base_kernel(const Tensor& result, iter.for_each(loop, grain_size); } -static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -95,7 +95,7 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t }); } -static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { +void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ 
-114,7 +114,7 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t }); } -static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { +void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t dim) { auto wrap_dim = maybe_wrap_dim(dim, self.dim()); int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim); @@ -135,7 +135,7 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t }); } -static void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { +void std_var_kernel_impl(TensorIterator& iter, double correction, bool take_sqrt) { AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "std_cpu", [&] { binary_kernel_reduce( iter, @@ -148,7 +148,7 @@ static void std_var_kernel_impl(TensorIterator& iter, double correction, bool ta }); } -static void prod_kernel_impl(TensorIterator& iter) { +void prod_kernel_impl(TensorIterator& iter) { // Workaround for the error: '*' in boolean context, suggest '&&' instead if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; @@ -203,7 +203,7 @@ void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { } } -static void norm_kernel_tensor_iterator_impl( +void norm_kernel_tensor_iterator_impl( TensorIterator& iter, const Scalar& p) { double val = 0; @@ -256,10 +256,10 @@ static void norm_kernel_tensor_iterator_impl( } else { if (iter.input_dtype() == kHalf && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel - return norm_kernel_cpu_impl(iter, val); + norm_kernel_cpu_impl(iter, val); return; } else if (iter.input_dtype() == kBFloat16 && iter.dtype(0) == kFloat) { // type promotion that does cast and reduction in a single kernel - return norm_kernel_cpu_impl(iter, val); + norm_kernel_cpu_impl(iter, val); return; } AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(kHalf, kBFloat16, kComplexHalf, iter.input_dtype(), "norm_cpu", [&] { @@ -274,7 +274,7 @@ static void norm_kernel_tensor_iterator_impl( } } -static void and_kernel_impl(TensorIterator& iter) { +void and_kernel_impl(TensorIterator& iter) { if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -312,7 +312,7 @@ static void and_kernel_impl(TensorIterator& iter) { } } -static void or_kernel_impl(TensorIterator& iter) { +void or_kernel_impl(TensorIterator& iter) { if (iter.dtype() == ScalarType::Byte) { // Refer [all, any : uint8 compatibility] binary_kernel_reduce_vec( @@ -346,7 +346,7 @@ struct MinValuesOps: public at::native::MinOps { } }; -static void min_values_kernel_impl(TensorIterator& iter) { +void min_values_kernel_impl(TensorIterator& iter) { if (iter.dtype() == kLong) { // This case is special because of Vectorized does not // handle upper_bound(). 
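// [Annotation, not part of the patch] The dominant mechanical change in these cpu/ kernel
// files is dropping `static` from functions and namespace-scope objects that already sit
// inside an unnamed namespace. Such declarations already have internal linkage, so the
// keyword is redundant. Minimal illustration (hypothetical names):
namespace {
[[maybe_unused]] static int helper_old() { return 1; }  // internal linkage; `static` adds nothing
[[maybe_unused]] int helper_new() { return 1; }         // same linkage without the keyword
}  // namespace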
@@ -367,7 +367,7 @@ static void min_values_kernel_impl(TensorIterator& iter) { }); } -static void max_values_kernel_impl(TensorIterator& iter) { +void max_values_kernel_impl(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] { binary_kernel_reduce_vec( iter, @@ -377,7 +377,7 @@ static void max_values_kernel_impl(TensorIterator& iter) { }); } -static void argmax_kernel_impl(TensorIterator &iter) { +void argmax_kernel_impl(TensorIterator &iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmax_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -401,7 +401,7 @@ static void argmax_kernel_impl(TensorIterator &iter) { }); } -static void argmin_kernel_impl(TensorIterator &iter) { +void argmin_kernel_impl(TensorIterator &iter) { AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(1), "argmin_cpu", [&] { if (is_reduce_lastdim(iter)) { using arg_t = std::pair; @@ -459,7 +459,7 @@ struct XorSumOps { } }; -static void xor_sum_kernel_impl(TensorIterator& iter) { +void xor_sum_kernel_impl(TensorIterator& iter) { // Use iter.dtype(1) to dispatch based on the type of the input tensor AT_DISPATCH_ALL_TYPES_AND3( kBFloat16, kHalf, kBool, iter.dtype(1), "xor_sum_cpu", [&] { diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index fd7c4a2750a6..1b0be8d18db7 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -8,7 +8,6 @@ #include #include #include -#include namespace at::native { inline namespace CPU_CAPABILITY { diff --git a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp index ed5658f5f0f5..8d22201ed63c 100644 --- a/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp +++ b/aten/src/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.cpp @@ -428,10 +428,11 @@ void fp16_gemv_trans( TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0); #if !defined(__aarch64__) || defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) if (at::globalContext().allowFP16ReductionCPU()) { - return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + return; } #endif - return fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); + fp16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, beta, y, incy); } float bf16_dot_with_fp32_arith(const at::BFloat16* vec1, const at::BFloat16* vec2, int64_t len) { @@ -465,7 +466,7 @@ void bf16_gemv_trans( at::BFloat16* y, const int incy) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(incx == 1 && alpha == 1.0 && beta == 0.0); - return bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); + bf16_gemv_trans_fp32_arith_by_dot_products(m, n, a, lda, x, y, incy); } float fp16_dot( diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index b6d8d684ae62..895263bc4466 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -41,7 +41,7 @@ class ReduceMultiply { *self_data = c10::load(self_data) && c10::load(src_data); } }; -static ReduceMultiply reduce_multiply; +ReduceMultiply reduce_multiply; class ReduceAdd { public: @@ -51,7 +51,7 @@ class ReduceAdd { *self_data += opmath_t(c10::load(src_data)); } }; -static ReduceAdd reduce_add; +ReduceAdd 
reduce_add; class ReduceMean { public: @@ -61,7 +61,7 @@ class ReduceMean { *self_data += opmath_t(c10::load(src_data)); } }; -static ReduceMean reduce_mean; +ReduceMean reduce_mean; class ReduceMaximum { public: @@ -73,7 +73,7 @@ class ReduceMaximum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::max(self_value, opmath_t(src_value)); } }; -static ReduceMaximum reduce_maximum; +ReduceMaximum reduce_maximum; class ReduceMinimum { public: @@ -85,7 +85,7 @@ class ReduceMinimum { *self_data = at::_isnan(src_value) ? opmath_t(src_value) : std::min(self_value, opmath_t(src_value)); } }; -static ReduceMinimum reduce_minimum; +ReduceMinimum reduce_minimum; class TensorAssign { public: @@ -95,7 +95,7 @@ class TensorAssign { *self_data = opmath_t(c10::load(src_data)); } }; -static TensorAssign tensor_assign; +TensorAssign tensor_assign; template struct _cpu_scatter_gather_dim_loop { diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index dac0f3bef25e..9ecfe55cedc4 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -17,7 +17,6 @@ #include #include #include -#include // [Note AVX-SSE transitions] In general we avoid calls into cmath for code // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in @@ -969,7 +968,7 @@ struct vec_host_softmax_backward { } }; -static void softmax_lastdim_kernel_impl( +void softmax_lastdim_kernel_impl( const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -978,13 +977,13 @@ static void softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } -static void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +void softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } -static void log_softmax_lastdim_kernel_impl( +void log_softmax_lastdim_kernel_impl( const Tensor& result, const Tensor& self) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -993,13 +992,13 @@ static void log_softmax_lastdim_kernel_impl( [&] { vec_host_softmax_lastdim::apply(result, self); }); } -static void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { +void log_softmax_kernel_impl(const Tensor& result, const Tensor& self, int64_t dim) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "softmax_kernel_impl", [&] { vec_softmax::apply(result, self, dim); }); } -static void softmax_backward_lastdim_kernel_impl( +void softmax_backward_lastdim_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1011,7 +1010,7 @@ static void softmax_backward_lastdim_kernel_impl( }); } -static void log_softmax_backward_lastdim_kernel_impl( +void log_softmax_backward_lastdim_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output) { @@ -1023,7 +1022,7 @@ static void log_softmax_backward_lastdim_kernel_impl( }); } -static void softmax_backward_kernel_impl( +void softmax_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output, @@ -1039,7 +1038,7 @@ static void softmax_backward_kernel_impl( }); } -static void log_softmax_backward_kernel_impl( +void log_softmax_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad, const Tensor& output, diff 
--git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index b7d83d85996b..7d337c119c98 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -90,7 +90,7 @@ struct KeyValueCompDesc { }; #ifdef USE_FBGEMM -static bool can_use_radix_sort(const TensorBase& values, const bool descending) { +bool can_use_radix_sort(const TensorBase& values, const bool descending) { // radix_sort can be used only for 1D data if (values.dim() != 1) return false; // radix_sort sorts in ascending order @@ -106,7 +106,7 @@ static bool can_use_radix_sort(const TensorBase& values, const bool descending) return true; } -static void parallel_sort1d_kernel( +void parallel_sort1d_kernel( const TensorBase& values, const TensorBase& indices) { AT_DISPATCH_INTEGRAL_TYPES(values.scalar_type(), "parallel_sort1d_kernel", [&] { @@ -140,7 +140,7 @@ static void parallel_sort1d_kernel( #endif template -static inline void sort_kernel_impl(const value_accessor_t& value_accessor, +inline void sort_kernel_impl(const value_accessor_t& value_accessor, const indices_accessor_t& indices_accessor, int64_t dim_size, bool descending, bool stable) { auto composite_accessor = CompositeRandomAccessorCPU< @@ -165,7 +165,7 @@ static inline void sort_kernel_impl(const value_accessor_t& value_accessor, } } -static void sort_kernel( +void sort_kernel( const TensorBase& self, const TensorBase& values, const TensorBase& indices, @@ -222,7 +222,7 @@ static void sort_kernel( ); } -static void topk_kernel( +void topk_kernel( const TensorBase &values, const TensorBase &indices, const TensorBase &self, diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 32364c38ea51..0fda4ae05f3e 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -286,12 +286,12 @@ struct CastStoreAccumulate { }; template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, scalar_t value) { StorePolicy::store(data, stride, index, value); } template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, const std::array &values) { auto *base_ptr = data + stride * index; for (const auto k : c10::irange(numel)) { @@ -301,7 +301,7 @@ static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, } template -static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, +void store(char * C10_RESTRICT data, int64_t stride, int64_t index, const Vectorized &values) { using vec_t = Vectorized; alignas(64) std::array array_values{}; diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index 2c52a61fc553..c479e1610cbe 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -29,7 +29,7 @@ namespace at::native { namespace { template -static inline void compare_base_kernel_core( +inline void compare_base_kernel_core( const Tensor& result1, const Tensor& result2, const Tensor& self, @@ -71,7 +71,7 @@ static inline void compare_base_kernel_core( } template -static inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, +inline void compare_base_kernel(const Tensor& result1, const Tensor& result2, const Tensor& self, int64_t dim, bool keepdim, @@ -98,7 
+98,7 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu result1, result2, self, dim, keepdim, loop); } -static void min_kernel_impl( +void min_kernel_impl( const Tensor& result, const Tensor& indice, const Tensor& self, @@ -131,7 +131,7 @@ static void min_kernel_impl( }); } -static void max_kernel_impl( +void max_kernel_impl( const Tensor& result, const Tensor& indice, const Tensor& self, @@ -164,7 +164,7 @@ static void max_kernel_impl( }); } -static void aminmax_kernel( +void aminmax_kernel( const Tensor& self, int64_t dim, bool keepdim, @@ -212,7 +212,7 @@ static void aminmax_kernel( }); } -static void where_kernel_impl(TensorIterator &iter) { +void where_kernel_impl(TensorIterator &iter) { AT_DISPATCH_V2( iter.dtype(), "where_cpu", [&] { cpu_kernel( @@ -224,19 +224,19 @@ static void where_kernel_impl(TensorIterator &iter) { kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_FLOAT8_TYPES)); } -static void isposinf_kernel_impl(TensorIteratorBase& iter) { +void isposinf_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isposinf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == std::numeric_limits::infinity(); }); }); } -static void isneginf_kernel_impl(TensorIteratorBase& iter) { +void isneginf_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.input_dtype(), "isneginf_cpu", [&]() { cpu_kernel(iter, [](scalar_t a) -> bool { return a == -std::numeric_limits::infinity(); }); }); } -static void mode_kernel_impl( +void mode_kernel_impl( Tensor& values, Tensor& indices, const Tensor& self, @@ -308,7 +308,7 @@ static void mode_kernel_impl( // Default brute force implementation of isin(). Used when the number of test elements is small. // Iterates through each element and checks it against each test element. 
-static void isin_default_kernel_cpu( +void isin_default_kernel_cpu( const Tensor& elements, const Tensor& test_elements, bool invert, @@ -339,7 +339,7 @@ static void isin_default_kernel_cpu( }); } -static void clamp_kernel_impl(TensorIteratorBase& iter) { +void clamp_kernel_impl(TensorIteratorBase& iter) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_cpu", [&]() { cpu_kernel_vec(iter, [](scalar_t a, scalar_t min, scalar_t max) -> scalar_t { @@ -355,7 +355,7 @@ static void clamp_kernel_impl(TensorIteratorBase& iter) { }); } -static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { +void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min_, const Scalar& max_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_scalar_cpu", [&]() { const auto min = min_.to(); const auto max = max_.to(); @@ -371,7 +371,7 @@ static void clamp_scalar_kernel_impl(TensorIteratorBase& iter, const Scalar& min }); } -static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { +void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_max_scalar_cpu", [&]() { const auto max = max_.to(); const Vectorized max_vec(max); @@ -385,7 +385,7 @@ static void clamp_max_scalar_kernel_impl(TensorIteratorBase& iter, Scalar max_) }); } -static void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { +void clamp_min_scalar_kernel_impl(TensorIteratorBase& iter, Scalar min_) { AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.common_dtype(), "clamp_min_scalar_cpu", [&]() { const auto min = min_.to(); const Vectorized min_vec(min); diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 8c94decfff02..444ec10861da 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -13,7 +13,7 @@ namespace at::native { namespace { template -static inline void cadd( +inline void cadd( scalar_t* z, const scalar_t* x, const scalar_t* y, @@ -34,7 +34,7 @@ static inline void cadd( } template -static void unfolded2d_acc( +void unfolded2d_acc( scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -113,7 +113,7 @@ static void unfolded2d_acc( } template -static void unfolded2d_acc_channels_last( +void unfolded2d_acc_channels_last( scalar_t* finput_data, scalar_t* input_data, int64_t kH, @@ -225,7 +225,7 @@ void unfolded2d_acc_kernel( } template -static void unfolded2d_copy( +void unfolded2d_copy( const scalar_t* input_data, scalar_t* finput_data, int64_t kH, @@ -240,7 +240,7 @@ static void unfolded2d_copy( int64_t output_height, int64_t output_width) { at::parallel_for( - 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { + 0, n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { int64_t nip = k / (kH * kW); int64_t rest = k % (kH * kW); @@ -316,7 +316,7 @@ static void unfolded2d_copy( for (int64_t x = 0; x < output_width; x++) memcpy( dst + (size_t)y * output_width + x, - src + (size_t)iy * input_width + ix + (int64_t)x * dW, + src + (size_t)iy * input_width + ix + x * dW, sizeof(scalar_t) * (1)); } } @@ -326,7 +326,7 @@ static void unfolded2d_copy( } template -static void unfolded2d_copy_channels_last( +void unfolded2d_copy_channels_last( const scalar_t* input_data, scalar_t* finput_data, int64_t kH, diff --git 
a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index 74fb38779ea1..e59e5985bf7f 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -157,13 +157,13 @@ struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { }; template -static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { +inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { using opmath_t = at::opmath_type; return Interpolate::eval(src, data, strides, i); } template -static inline scalar_t interpolate_aa_single_dim_zero_strides( +inline scalar_t interpolate_aa_single_dim_zero_strides( char* src, char** data, const index_t ids_stride) { @@ -187,7 +187,7 @@ static inline scalar_t interpolate_aa_single_dim_zero_strides( } template -static inline scalar_t interpolate_aa_single_dim( +inline scalar_t interpolate_aa_single_dim( char* src, char** data, const int64_t* strides, @@ -213,7 +213,7 @@ static inline scalar_t interpolate_aa_single_dim( } template -static inline bool is_zero_stride(const int64_t* strides) { +inline bool is_zero_stride(const int64_t* strides) { bool output = strides[0] == 0; for (const auto i : c10::irange(1, m)) { output &= (strides[i] == 0); @@ -222,7 +222,7 @@ static inline bool is_zero_stride(const int64_t* strides) { } template -static inline bool is_contiguous_stride(const int64_t* strides) { +inline bool is_contiguous_stride(const int64_t* strides) { bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t)); for (int i=2; i<2 * interp_size; i+=2) { output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t)); @@ -282,13 +282,13 @@ struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, inte }; template -static inline bool check_almost_all_zero_stride(const int64_t* strides) { +inline bool check_almost_all_zero_stride(const int64_t* strides) { return CheckAlmostAllZeroStrides::eval(strides); } // Helper method to compute interpolation for nearest, linear, cubic modes template -static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { +inline void basic_loop(char** data, const int64_t* strides, int64_t n) { char* dst = data[0]; char* src = data[1]; for (const auto i : c10::irange(n)) { @@ -298,7 +298,7 @@ static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { } template -static inline void basic_loop_aa_vertical( +inline void basic_loop_aa_vertical( char** data, const int64_t* strides, int64_t n, @@ -354,7 +354,7 @@ inline void basic_loop_aa_vertical( } template -static inline void basic_loop_aa_horizontal( +inline void basic_loop_aa_horizontal( char** data, const int64_t* strides, int64_t n, @@ -1038,7 +1038,7 @@ struct HelperInterpNearest : public HelperInterpBase { // We keep this structure for BC and consider as deprecated. 
// See HelperInterpNearestExact as replacement - static const int interp_size = 1; + static constexpr int interp_size = 1; static inline void init_indices_weights( at::ScalarType output_type, @@ -1155,7 +1155,7 @@ struct HelperInterpNearestExact : public HelperInterpNearest { struct HelperInterpLinear : public HelperInterpBase { - static const int interp_size = 2; + static constexpr int interp_size = 2; // Compute indices and weights for each interpolated dimension // indices_weights = { @@ -1275,7 +1275,7 @@ struct HelperInterpLinear : public HelperInterpBase { struct HelperInterpCubic : public HelperInterpBase { - static const int interp_size = 4; + static constexpr int interp_size = 4; // Compute indices and weights for each interpolated dimension // indices_weights = { diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 5b545509b1d9..24eddb3e1310 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -35,7 +35,7 @@ Like PIL, Pillow is licensed under the open source HPND License namespace { -static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { int32_t v; if (i32_aligned) { v = *(const int32_t*)ptr; @@ -45,11 +45,11 @@ static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32 return _mm_cvtsi32_si128(v); } -static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { +inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); } -static inline void _write_endline_rgb_as_uint32( +inline void _write_endline_rgb_as_uint32( uint8_t* C10_RESTRICT output, uint32_t data ) { diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index c8e0b8e86793..676e8bebcec1 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -838,7 +838,7 @@ void dyn_quant_pack_4bit_weight_kernel( } } -static void ref_dyn_quant_matmul_4bit_channelwise_kernel( +void ref_dyn_quant_matmul_4bit_channelwise_kernel( size_t m, size_t n, size_t k, @@ -906,7 +906,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel( // Round to nearest integer const int32_t nudged_zero_point0 = lrintf(zero_point0); - int8_t* dst_ptr = (int8_t*)lhs_qa8dx + m_idx * dst_stride; + int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; // LHS offset at the beginning of the row *((float*)(dst_ptr)) = recip_scale0; @@ -997,7 +997,7 @@ static void ref_dyn_quant_matmul_4bit_channelwise_kernel( } } -static void ref_dyn_quant_matmul_4bit_groupwise_kernel( +void ref_dyn_quant_matmul_4bit_groupwise_kernel( size_t m, size_t n, size_t k, @@ -1048,7 +1048,7 @@ static void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = (std::min)(zero_point0, qmax); const int32_t nudged_zero_point0 = lrintf(zero_point0); - int8_t* dst_ptr = (int8_t*)lhs_qa8dx + row_idx * dst_stride; + int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; *((float*)(dst_ptr)) = recip_scale0; dst_ptr += sizeof(float); diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp index 7e2cba98ff1d..496b98261964 100644 --- a/aten/src/ATen/native/cpu/int8mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -100,7 +100,7 @@ inline 
void tinygemm_kernel( #elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) -static inline float _mm256_reduce_add_ps(__m256& v) { +inline float _mm256_reduce_add_ps(__m256& v) { __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); v = _mm256_add_ps(v, v1); v1 = _mm256_shuffle_ps(v, v, 0x4E); diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 1dab8c19c700..68a9582a09c1 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -105,7 +106,8 @@ c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, b } } -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; +using at::blas::SwizzleType; /** * @brief Prepares matrices for CUBLAS operation @@ -285,8 +287,8 @@ static bool isSupportedHipLtROCmArch(int index) { #if ROCM_VERSION >= 60300 "gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908", #endif -#if ROCM_VERSION >= 60500 - "gfx950" +#if ROCM_VERSION >= 70000 + "gfx950", "gfx1150", "gfx1151" #endif }; return at::detail::getCUDAHooks().isGPUArch(archs, index); @@ -294,7 +296,7 @@ static bool isSupportedHipLtROCmArch(int index) { #endif template -static void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { +void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) { bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); at::cuda::tunable::GemmAndBiasParams params; @@ -1112,7 +1114,7 @@ namespace{ * - Returns Error. */ -using at::cuda::blas::ScalingType; +using at::blas::ScalingType; bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) { return isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.numel() == 1; @@ -1124,6 +1126,17 @@ bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) { && scale.is_contiguous()); } +bool check_size_stride(const at::Tensor& scale, int dim, int size, int stride) { + // For Blockwise1x128 and Blockwise128x128, + // when the scale tensor has a dimension of size 1, the stride is effectively + // "meaningless", i.e. PyTorch decides to use a stride of 1. Thus, the regular + // stride check fails. Here, we relax the stride check when the effective + // stride is 1. 
+ + return ( + scale.size(dim) == size && (size <= 1 || scale.stride(dim) == stride)); +} + // 1x16 blocks for packed nvfp4 data and fp8_e4m3fn scales bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { // Multiply t.size(1) by 2 to adjust for fp4x2 packing @@ -1138,21 +1151,35 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) { bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) { // TODO: We might want to enforce some structure on the shapes of the scale // tensors - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu - && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4) - && scale.is_contiguous()); + bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4)); + bool is_packed_fp4_path = false; +#ifdef USE_ROCM + is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu + && scale.numel() == round_up(t.size(0), 128) * round_up(ceil_div(t.size(1) * 2, 32), 4)); +#endif + return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous(); } bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) { - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2 - && scale.size(0) == t.size(0) && scale.size(1) == ceil_div(t.size(1), 128) - && scale.stride(0) == 1 && scale.stride(1) == t.size(0)); + return ( + isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && + scale.dim() == 2 && check_size_stride(scale, 0, t.size(0), 1) && + check_size_stride( + scale, 1, ceil_div(t.size(1), 128), t.size(0))); } bool is_blockwise_128x128_scaling(const at::Tensor& t, const at::Tensor& scale) { - return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2 - && scale.size(0) == ceil_div(t.size(0), 128) && scale.size(1) == ceil_div(t.size(1), 128) - && scale.stride(0) == round_up(ceil_div(t.size(1), 128), 4) && scale.stride(1) == 1); + return ( + isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && + scale.dim() == 2 && + check_size_stride( + scale, + 0, + ceil_div(t.size(0), 128), + ceil_div(t.size(1), 128)) && + check_size_stride( + scale, 1, ceil_div(t.size(1), 128), 1)); } bool is_desired_scaling(const at::Tensor& t, const at::Tensor& scale, ScalingType desired_scaling) { @@ -1203,8 +1230,207 @@ std::pair get_joint_scaling( ); } +Tensor& +_tunable_scaled_gemm_rocm( + cublasCommonArgs& args, + const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, const Tensor& scale_b, + const ScalingType scaling_choice_a, const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + const at::ScalarType out_dtype, + Tensor& out) { +#ifdef USE_ROCM +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if 
(mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = args.scale_mata_ptr; + params.a_scale_dtype = args.scale_mata_dtype.value(); + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.a_scale_dtype = args.scale_mata_dtype.value(); + params.a_scaling_type = args.scaling_mata_type.value(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = args.scale_matb_ptr; + params.b_scale_dtype = args.scale_matb_dtype.value(); + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.b_scale_dtype = args.scale_matb_dtype.value(); + params.b_scaling_type = args.scaling_matb_type.value(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype) ? 
at::ScalarType::Half : out_dtype; + params.c = args.result->data_ptr(); + params.c_scale_ptr = args.scale_result_ptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype; + params.use_fast_accum = use_fast_accum; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + return out; +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "_scaled_gemm_rocm only callable on ROCM devices"); +#endif +} + +Tensor& +_scaled_gemm( + const Tensor& mat1, const Tensor& mat2, + const Tensor& scale_a, const Tensor& scale_b, + const ScalingType scaling_choice_a, const ScalingType scaling_choice_b, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out, + const std::optional& alpha = std::nullopt) { + cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, std::nullopt, scaling_choice_a, scaling_choice_b); + const auto out_dtype_ = args.result->scalar_type(); + TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); + +// ROCM enables the TunableOp path only +// but can fallback to at::cuda::blas::scaled_gemm +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + bool tunable_op_enabled = tuning_ctx->IsTunableOpEnabled(); +#else + bool tunable_op_enabled = false; +#endif + if (tunable_op_enabled) { + // Only available on ROCM + return _tunable_scaled_gemm_rocm( + args, + mat1, mat2, + scale_a, scale_b, + scaling_choice_a, scaling_choice_b, + bias, + use_fast_accum, + out_dtype_, + out); + } + else + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + args.scale_mata_ptr, + args.lda, + args.mata->scalar_type(), + args.scale_mata_dtype.value(), + args.scaling_mata_type.value(), + args.matb->data_ptr(), + args.scale_matb_ptr, + args.ldb, + args.matb->scalar_type(), + args.scale_matb_dtype.value(), + args.scaling_matb_type.value(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + args.scale_result_ptr, + args.result_ld, + out_dtype_, + use_fast_accum, + alpha); + return out; + } +} + } // namespace +// NOTE(slayton58): This is defined as part of the _v2 code (way) below - declare the signature here +// to help cleanup v1 call structure. +Tensor& +_scaled_rowwise_rowwise( + const Tensor&, const Tensor&, + const Tensor&, const Tensor&, + const std::optional&, + const c10::ScalarType, + bool, + Tensor&); + + // Computes matrix multiply + bias while applying scaling to input and output matrices // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, scale_result is not applied. @@ -1246,6 +1472,10 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // by decreasing priority. 
We prefer "simpler" schemes as they are supported // more broadly (more GPU archs, more CUDA versions) and because they are more // efficient. This tends to matter only for small matmuls (e.g., 1x1x128). + + // List of supported BlockWise pairs for FP8: + // https://docs.nvidia.com/cuda/cublas/#element-1d-and-128x128-2d-block-scaling-for-fp8-data-types + auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling( { std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise), @@ -1278,7 +1508,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, TORCH_CHECK(isFloat8Type(mat2.scalar_type()) || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2, "Expected mat2 to be Float8 or Float4_x2 matrix got ", mat2.scalar_type()); #ifndef USE_ROCM // Type restrictions imposed by CuBLASLt as of CUDA-12.1 - TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, + TORCH_CHECK_VALUE(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2, "Multiplication of two Float8_e5m2 matrices is not supported"); #endif if (use_fast_accum) { @@ -1344,200 +1574,66 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, // NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9, // and only for compute capability 9.0+. In other cases we use CUTLASS. -#ifndef USE_ROCM // We are doing row-wise scaling - auto dprops = at::cuda::getCurrentDeviceProperties(); - if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise - && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) - // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales - || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { - TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); - at::cuda::detail::f8f8bf16_rowwise( - mat1, - mat2, - scale_a, - scale_b, - bias, - use_fast_accum, - out); - return out; - } -#else if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise) { +#ifndef USE_ROCM + auto dprops = at::cuda::getCurrentDeviceProperties(); + if ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty()))) { + TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + return _scaled_rowwise_rowwise( + mat1, + mat2, + scale_a, + scale_b, + bias, + out.scalar_type(), + use_fast_accum, + out); + } +#else // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. Tensor b = mat2; if (_scaled_mm_is_fnuz()) { - TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz); + TORCH_CHECK_VALUE(b.dtype() == at::kFloat8_e4m3fnuz, + "Expected b.dtype() == at::kFloat8_e4m3fnuz, got: ", b.dtype()); } else { - TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn); + TORCH_CHECK_VALUE(b.dtype() == at::kFloat8_e4m3fn, + "Expected b.dtype() == at::kFloat8_e4m3fn, got: ", b.dtype()); } // Until more than bf16 is supported. 
- TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16, + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16, "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); +#endif } else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) { +#ifdef USE_ROCM #if ROCM_VERSION >= 70000 - TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); - TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 && - mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0, - "Matrix dimensions must be multiples of 32 for block-wise scaling"); + int packed_factor = 1; + if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + // For float4 data type, each byte stores two 4-bit floating-point values, + // effectively packing two elements into one byte. + packed_factor = 2; + } + TORCH_CHECK_VALUE(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 && + mat2.size(1) % 16 == 0, + "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling"); - TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 || + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || out.scalar_type() == ScalarType::Half, "Block-wise scaling only supports BFloat16 or Half output types"); #else - TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); -#endif - } + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); #endif - - cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b); - const auto out_dtype_ = args.result->scalar_type(); - TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); - -#ifdef USE_ROCM - auto tuning_ctx = at::cuda::tunable::getTuningContext(); - if (tuning_ctx->IsTunableOpEnabled()) { -#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ - if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e4m3fn) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fn, at::Float8_e4m3fn, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == 
ScalarType::Float8_e5m2) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e4m3fn, at::Float8_e5m2, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } \ - else if (mat1.scalar_type() == ScalarType::Float8_e5m2) { \ - if (mat2.scalar_type() == ScalarType::Float8_e4m3fn) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2, at::Float8_e4m3fn, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - else if (mat2.scalar_type() == ScalarType::Float8_e5m2) { \ - static at::cuda::tunable::ScaledGemmTunableOp< \ - at::Float8_e5m2, at::Float8_e5m2, scalar_t, \ - BLASOP_A, BLASOP_B> scaledgemm{}; \ - scaledgemm(¶ms); \ - } \ - } - AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { - bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); - bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); - at::cuda::tunable::ScaledGemmParams params; - params.transa = args.transa; - params.transb = args.transb; - params.m = args.m; - params.n = args.n; - params.k = args.k; - params.a = args.mata->data_ptr(); - params.a_scale_ptr = args.scale_mata_ptr; - params.a_scale_dtype = args.scale_mata_dtype.value(); - params.lda = args.lda; - params.a_dtype = args.mata->scalar_type(); - params.a_scale_dtype = args.scale_mata_dtype.value(); - params.a_scaling_type = args.scaling_mata_type.value(); - params.b = args.matb->data_ptr(); - params.b_scale_ptr = args.scale_matb_ptr; - params.b_scale_dtype = args.scale_matb_dtype.value(); - params.ldb = args.ldb; - params.b_dtype = args.matb->scalar_type(); - params.b_scale_dtype = args.scale_matb_dtype.value(); - params.b_scaling_type = args.scaling_matb_type.value(); - params.bias_ptr = bias ? bias->data_ptr(): nullptr; - params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; - params.c = args.result->data_ptr(); - params.c_scale_ptr = args.scale_result_ptr; - params.ldc = args.result_ld; - params.c_dtype = out_dtype_; - params.use_fast_accum = use_fast_accum; - if (transa_ && transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) - } - else if (transa_ && !transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) - } - else if (!transa_ && transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) - } - else if (!transa_ && !transb_) { - TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) - } - else { - TORCH_CHECK(false, "unreachable"); - } - }), - kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_FLOATING_TYPES)); -#undef TUNABLE_DISPATCH - } - else #endif - { - at::cuda::blas::scaled_gemm( - args.transa, - args.transb, - args.m, - args.n, - args.k, - args.mata->data_ptr(), - args.scale_mata_ptr, - args.lda, - args.mata->scalar_type(), - args.scale_mata_dtype.value(), - args.scaling_mata_type.value(), - args.matb->data_ptr(), - args.scale_matb_ptr, - args.ldb, - args.matb->scalar_type(), - args.scale_matb_dtype.value(), - args.scaling_matb_type.value(), - bias ? bias->data_ptr(): nullptr, - bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, - args.result->data_ptr(), - args.scale_result_ptr, - args.result_ld, - out_dtype_, - use_fast_accum); } - return out; + return _scaled_gemm(mat1, mat2, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); } namespace { @@ -1648,197 +1744,1330 @@ _scaled_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, bool use_fast_accum) { const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + return _scaled_mm_out_cuda(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +/** + * Track concrete implementations available + */ +enum class ScaledGemmImplementation { + NONE = 0, + TENSORWISE_TENSORWISE = 1, + ROWWISE_ROWWISE = 2, + BLOCK_128x128_1x128 = 3, + BLOCK_1x128_128x128 = 4, + BLOCK_1x128_1x128 = 5, + MXFP8_MXFP8 = 6, + NVFP4_NVFP4 = 7, + NVFP4_NVFP4_SINGLE_SCALE = 8, + MXFP4_MXFP4 = 9, +}; -Tensor -_scaled_grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, -const Tensor& scale_a, const Tensor& scale_b, -const std::optional& offs, -const std::optional& bias, -const std::optional& scale_result, -std::optional out_dtype, -bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); +/** + * Convert passed int (enum) from python back into a + * strictly-typed enum + */ +template +std::vector convert_int_to_enum(ArrayType& v) { + std::vector converted; + converted.reserve(v.size()); - TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); - TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + for (auto vi : v) { + converted.push_back(static_cast(vi)); } - TORCH_CHECK( - mat_a.size(-1) % 16 == 0, - "Expected trailing dimension of mat_a to be divisible by 16 ", - "but got mat1 shape: (", - mat_a.sizes(), - ")."); - TORCH_CHECK(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, - "Expected mat_b shape to be divisible by 16 ", - "but got mat_b shape: (", - mat_b.sizes(), - ")."); - + return converted; +} - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - TORCH_CHECK(!scale_result.has_value(), "Scale result not supported yet"); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); +/** + * Both inputs must be fp8, + * Each needs a single scale, {Tensorwise (float)} + */ +bool check_tensorwise_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + // 1 scale each, {Tensorwise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || 
recipe_b.size() != 1) { + return false; } + // Need {TensorWise, float} for A & B + if (recipe_a[0] != ScalingType::TensorWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::TensorWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; - // FP8 per-tensor and per-row scaling expect fp32 scales. - // MXFP8 expects float8_e8m0fnu scales. - TORCH_CHECK( - (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || - (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), - "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); + return true; +} - const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? offs->size(0) : 1; - check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); - check_scale(mat_b, scale_b, 1, 1, scale_multiplier); +/** + * Both inputs must be fp8, + * Each needs scales, {Rowwise (float)} + */ +bool check_rowwise_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) { + return false; + } - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + // 1 scale each, {RowWise, float} + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + // Need {RowWise, fp32} for A & B + if (recipe_a[0] != ScalingType::RowWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::RowWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; -#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) - // MXFP8 grouped GEMM dispatching - bool is_mx8mx8bf16 = ( - mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && - scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu - ); - TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + return true; +} - if (is_mx8mx8bf16) { - bool b_is_3d = mat_b.dim() == 3; - bool is_2d_2d = a_is_2d && b_is_2d; - bool is_2d_3d = a_is_2d && b_is_3d; - TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); - TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); - fbgemm_gpu::mx8mx8bf16_grouped_mm( - mat_a, - mat_b, - scale_a, - scale_b, - offs.value(), - out); - return out; +/** + * Two-level scaling, canonical NVFP4 + * Both inputs must be fp4 + * A, B need 2 scales, {Blockwise_1x16 (e4m3), Tensorwise (fp32)} + */ +bool check_nvfp4_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; } -#endif - -#ifndef USE_ROCM - TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); - 
TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); - - at::cuda::detail::f8f8bf16_grouped_mm( - mat_a, - mat_b, - scale_a, - scale_b, - offs, - bias, - use_fast_accum, - out); - return out; -#else -#ifdef USE_FBGEMM_GENAI - TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type()); - fbgemm_gpu::f8f8bf16_rowwise_grouped_mm( - mat_a, - // FBGEMM expects B matrix shape to be (.., N, K) - mat_b.transpose(-2, -1), - scale_a, - scale_b, - offs, - out); - return out; -#else - TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") -#endif + // 2 scales, 2 recipes for each input + if (scales_a.size() != 2 || recipe_a.size() != 2 || scales_b.size() != 2 || recipe_b.size() != 2) { + return false; + } -#endif + // Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]} + if (recipe_a[0] != ScalingType::BlockWise1x16 || recipe_a[1] != ScalingType::TensorWise) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_a[1].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != ScalingType::BlockWise1x16 || recipe_b[1] != ScalingType::TensorWise) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn || scales_b[1].scalar_type() != ScalarType::Float) return false; + return true; } -Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, -const std::optional& offs, -const std::optional& bias, -std::optional out_dtype) { - _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); - bool a_b_and_out_are_bf16 = ( - mat_a.dtype() == at::kBFloat16 && - mat_b.dtype() == at::kBFloat16 && - out_dtype.value_or(at::kBFloat16) == at::kBFloat16 - ); -#ifndef USE_ROCM - bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; -#else - // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. 
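For context on the relaxed stride validation that check_size_stride() introduces earlier in this file's hunk: when a scale dimension has size <= 1 its stride carries no information, so only the size is compared. Below is a minimal standalone sketch of that rule, not part of the patch; the FakeScale struct, relaxed_size_stride_check name, and the test values are invented for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the size/stride metadata of a scale tensor.
struct FakeScale {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
};

// Same rule as check_size_stride(): accept any stride when the dimension is
// degenerate (size <= 1), otherwise require an exact stride match.
bool relaxed_size_stride_check(const FakeScale& s, int dim, int64_t size, int64_t stride) {
  return s.sizes[dim] == size && (size <= 1 || s.strides[dim] == stride);
}

int main() {
  // A [1, 4] scale stored contiguously reports stride(0) == 4, yet a check
  // expecting stride(0) == 1 should still pass because size(0) == 1 makes
  // the stride meaningless.
  FakeScale scale{{1, 4}, {4, 1}};
  assert(relaxed_size_stride_check(scale, /*dim=*/0, /*size=*/1, /*stride=*/1));
  assert(relaxed_size_stride_check(scale, /*dim=*/1, /*size=*/4, /*stride=*/1));
  return 0;
}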
- // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm - bool use_fast_path = false; -#endif - const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); - if (use_fast_path) { - // fast path, no d2h sync needed - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); - } else { - _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); +/** + * Single-level scaling, what PyT currently understands + * Both inputs must be fp4 + * A, B need 1 scale, {Blockwise_1x16 (e4m3)} + */ +bool check_nvfp4_recipe_single_scale + (c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; } - return out; -} -Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { - IntArrayRef batch1_sizes = batch1.sizes(); - IntArrayRef batch2_sizes = batch2.sizes(); + // 2 scales, 2 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } - Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); - return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); -} + // Need {Blockwise_1x16, e4m3 for scale[0], Tensorwise, fp32 for scale[1]} + if (recipe_a[0] != ScalingType::BlockWise1x16) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e4m3fn) return false; + if (recipe_b[0] != ScalingType::BlockWise1x16) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e4m3fn) return false; -Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + return true; +} - TORCH_CHECK(out_dtype == batch1.scalar_type() || - (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), - "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); +/** + * Both inputs must be fp8 + * A, B must only have 1 scale each, A: {Blockwise_1x128 (float), B: {Blockwise_128x128 (float) + */ +bool check_deepseek_recipe(ScalingType expected_recipe_a, + ScalingType expected_recipe_b, + c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) { + return false; + } - Scalar beta(0.0); - Scalar alpha(1.0); - { - NoNamesGuard guard; - baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; } - return out; + // Need {Blockwise_1x128, float} for A, {Blockwise_128x128, float} for B + if (recipe_a[0] != expected_recipe_a) return false; + if (scales_a[0].scalar_type() != ScalarType::Float) return false; + if (recipe_b[0] != expected_recipe_b) return false; + if (scales_b[0].scalar_type() != ScalarType::Float) return false; + + return true; } 
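The check_deepseek_recipe helper above is deliberately parameterized by the expected ScalingType pair so that one function can cover the 1x128/128x128, 128x128/1x128, and 1x128/1x128 combinations; scale_kernel_dispatch below then fixes those parameters with std::bind so every entry exposes the same acceptance_fn signature. A self-contained sketch of that pattern follows; the Recipe enum and check_pair function are invented for this example and are not part of the patch.

#include <cstdio>
#include <functional>
#include <vector>

// Invented enum standing in for at::blas::ScalingType in this example.
enum class Recipe { TensorWise, RowWise, BlockWise1x128, BlockWise128x128 };

// Parameterized checker: the expected recipe pair is supplied up front,
// the per-call recipe lists are validated against it.
bool check_pair(Recipe expected_a, Recipe expected_b,
                const std::vector<Recipe>& recipe_a,
                const std::vector<Recipe>& recipe_b) {
  return recipe_a.size() == 1 && recipe_b.size() == 1 &&
         recipe_a[0] == expected_a && recipe_b[0] == expected_b;
}

int main() {
  using namespace std::placeholders;
  // Bind the expected pair so the callable matches a uniform two-argument
  // signature, the same trick scale_kernel_dispatch uses with acceptance_fn.
  std::function<bool(const std::vector<Recipe>&, const std::vector<Recipe>&)>
      accept_1x128_128x128 =
          std::bind(check_pair, Recipe::BlockWise1x128, Recipe::BlockWise128x128, _1, _2);

  std::vector<Recipe> a{Recipe::BlockWise1x128};
  std::vector<Recipe> b{Recipe::BlockWise128x128};
  std::printf("accepted: %s\n", accept_1x128_128x128(a, b) ? "yes" : "no");  // prints "yes"
  return 0;
}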
-Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - // We need to copy the tensor - Tensor out = self.clone().to(self.options().dtype(out_dtype)); +/** + * Both inputs must be fp8 + * A, B must have 1 scale each, {Blockwise_1x32, e8m0} + */ +bool check_mxfp8_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp8 + if (type_a != ScalarType::Float8_e4m3fn || type_b != ScalarType::Float8_e4m3fn) { + return false; + } - return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); -} + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } -Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::BlockWise1x32) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + if (recipe_b[0] != ScalingType::BlockWise1x32) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; - TORCH_CHECK(out_dtype == batch1.scalar_type() || - (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), + return true; +} + +/** + * Both inputs must be fp4 + * A, B must have 1 scale each, {Blockwise_1x32, e8m0} + */ +bool check_mxfp4_recipe(c10::ScalarType type_a, + std::vector& recipe_a, + ArrayRef& scales_a, + c10::ScalarType type_b, + std::vector& recipe_b, + ArrayRef& scales_b) { + // both types must be fp4 + if (type_a != ScalarType::Float4_e2m1fn_x2 || type_b != ScalarType::Float4_e2m1fn_x2) { + return false; + } + + // 1 scales, 1 recipes for each input + if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 || recipe_b.size() != 1) { + return false; + } + + // Need {Blockwise_1x32, e8m0} for A & B + if (recipe_a[0] != ScalingType::BlockWise1x32) return false; + if (scales_a[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + if (recipe_b[0] != ScalingType::BlockWise1x32) return false; + if (scales_b[0].scalar_type() != ScalarType::Float8_e8m0fnu) return false; + + return true; +} + +using acceptance_fn = std::function&, ArrayRef&, c10::ScalarType, std::vector&, ArrayRef&)>; +using namespace std::placeholders; + +std::array, 9> scale_kernel_dispatch = {{ + { "tensorwise_tensorwise", check_tensorwise_recipe, ScaledGemmImplementation::TENSORWISE_TENSORWISE }, + { "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE}, + { "block_1x128_128x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise128x128, _1, _2, _3, _4, _5, _6), + ScaledGemmImplementation::BLOCK_1x128_128x128}, + { "block_128x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise128x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, _5, _6), + ScaledGemmImplementation::BLOCK_128x128_1x128}, + { "block_1x128_1x128", std::bind(check_deepseek_recipe, ScalingType::BlockWise1x128, ScalingType::BlockWise1x128, _1, _2, _3, _4, 
_5, _6), + ScaledGemmImplementation::BLOCK_1x128_1x128}, + { "nvfp4_nvfp4", check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}, + { "nvfp4_nvfp4_single_scale", check_nvfp4_recipe_single_scale, ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE }, + { "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}, + { "mxfp4_mxfp4", check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4}}}; + +Tensor& +_scaled_tensorwise_tensorwise( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32 + // + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.numel() == 1 && scale_a.scalar_type() == kFloat, "scale_a must have 1 Float element") + TORCH_CHECK_VALUE(scale_b.numel() == 1 && scale_b.scalar_type() == kFloat, "scale_b must have 1 Float element") + + auto scaling_choice_a = ScalingType::TensorWise; + auto scaling_choice_b = ScalingType::TensorWise; + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + + +Tensor& +_scaled_rowwise_rowwise( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape M/N for A/B + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1, "scale_a must have shape [", mat_a.size(0), ", 1], got [", scale_a.sizes(), "]"); + TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel()) + TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel()) + + TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)); + TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1)); + + auto scaling_choice_a = ScalingType::RowWise; + auto scaling_choice_b = ScalingType::RowWise; + // + // NVIDIA's cuBLAS only started supporting row-wise scaling in version 12.9, + // and only for compute capability 9.0+. In other cases we use CUTLASS. +#ifndef USE_ROCM + // We are doing row-wise scaling + auto dprops = at::cuda::getCurrentDeviceProperties(); + if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900) + // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales + || (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) { + TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling."); + at::cuda::detail::f8f8bf16_rowwise( + mat_a, + mat_b, + scale_a, + scale_b, + bias, + use_fast_accum, + out); + return out; + } +#else + + // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes. 
+ //Tensor b = mat_b; + if (_scaled_mm_is_fnuz()) { + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "expected mat_b.dtype() to be at::kFloat8_e4m3fnuz, but got ", mat_b.dtype()); + } + else { + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "expected mat_b.dtype() to be at::kFloat8_e4m3fn, but got ", mat_b.dtype()); + } + // Until more than bf16 is supported. + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16, + "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type()); +#endif + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +// Check the shapes & sizes of scales for deepseek-style (1x128, 128x128) scaling. +// Wraps check_size_stride for easier integration, correctly handles cases where a dimension of the scale == 1, +// and strides become somewhat meaningless +void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const ScalingType scale_type) { + if (scale_type == ScalingType::BlockWise1x128) { + TORCH_CHECK_VALUE(check_size_stride(scale, 0, t.size(0), 1), + "at dim=0 scale should have ", t.size(0), "elements and stride(0) ", 1, "if ", t.size(0), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + auto expected_size = ceil_div(t.size(1), 128); + TORCH_CHECK_VALUE(check_size_stride(scale, 1, expected_size, t.size(0)), + "at dim=1 scale should have ", expected_size, "elements and stride ", t.size(0), "if ", expected_size, " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + } else if (scale_type == ScalingType::BlockWise128x128) { + TORCH_CHECK_VALUE(check_size_stride( + scale, + 0, + ceil_div(t.size(0), 128), + ceil_div(t.size(1), 128)), + "at dim=0 scale should have ", ceil_div(t.size(0), 128), "elements and stride(0) ", ceil_div(t.size(1), 128), "if ", ceil_div(t.size(0), 128), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + TORCH_CHECK(check_size_stride( + scale, 1, ceil_div(t.size(1), 128), 1), + "at dim=1 scale should have ", ceil_div(t.size(1), 128), "elements and stride(1) ", 1, "if ", ceil_div(t.size(1), 128), " > 1 - Got: ", + "shape=", scale.sizes(), ", stride=", scale.strides()); + } +} + +Tensor& +_scaled_block1x128_block1x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape K//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", ceil_div(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise1x128; + auto scaling_choice_b = ScalingType::BlockWise1x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + 
_check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_block128x128_block1x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional<Tensor>& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, shape K//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", ceil_div(mat_a.sizes()[0], 128), " x ", ceil_div(mat_a.sizes()[1], 128), " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == ceil_div(mat_b.sizes()[0], 128) && scale_b.sizes()[1] == mat_b.sizes()[1] && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", ceil_div(mat_b.sizes()[0], 128), " x ", mat_b.sizes()[1], " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise128x128; + auto scaling_choice_b = ScalingType::BlockWise1x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + _check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_block1x128_block128x128( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const Tensor& scale_b, + const std::optional<Tensor>& bias, + const c10::ScalarType out_dtype, + const bool use_fast_accum, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128 + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat, + "scale_a must have shape ", mat_a.sizes()[0], " x ", mat_a.sizes()[1] / 128, " Float elements, got ", scale_a.sizes()) + TORCH_CHECK_VALUE(scale_b.sizes()[0] == mat_b.sizes()[0] / 128 && scale_b.sizes()[1] == mat_b.sizes()[1] / 128 && scale_b.scalar_type() == kFloat, + "scale_b must have shape ", mat_b.sizes()[0] / 128, " x ", mat_b.sizes()[1] / 128, " Float elements, got ", scale_b.sizes()) + + auto scaling_choice_a = ScalingType::BlockWise1x128; + auto scaling_choice_b = ScalingType::BlockWise128x128; + + // Check scale strides (including stride=1 small cases) + _check_deepseek_scale_stride(scale_a, mat_a, scaling_choice_a); + _check_deepseek_scale_stride(scale_b.t(), mat_b.t(), scaling_choice_b); + + _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out); + + return out; +} + +Tensor& +_scaled_mxfp8_mxfp8( + const Tensor& mat_a, const Tensor& mat_b, + const
Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out) { + // Restrictions: + // A, B are FP8, scales are e8m0, A: shape K//32, B: K, N//32 + // Scales must be swizzled + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + +#ifdef USE_ROCM + auto scale_a_elems = ceil_div(mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(mat_b.size(1), 32) * mat_b.size(0); +#else + auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1), 32), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0), 32), 4); +#endif + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + +#ifndef USE_ROCM + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); +#endif + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); + + auto scaling_choice_a = ScalingType::BlockWise1x32; + auto scaling_choice_b = ScalingType::BlockWise1x32; + +#ifdef USE_ROCM +#if ROCM_VERSION >= 70000 + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 && + mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif +#endif + + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +} + + +Tensor& +_scaled_mxfp4_mxfp4( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out) { +#ifndef USE_ROCM + TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only"); +#endif + // Restrictions: + // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 + TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + + auto scale_a_elems = ceil_div(2 * mat_a.size(0), 32) * mat_a.size(1); + auto scale_b_elems = ceil_div(2 * mat_b.size(1), 32) * mat_b.size(0); + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, 
got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype); + + auto scaling_choice_a = ScalingType::BlockWise1x32; + auto scaling_choice_b = ScalingType::BlockWise1x32; + +#if ROCM_VERSION >= 70000 + TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}), + "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950"); + + TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 && + mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0, + "Matrix dimensions must be multiples of 32 for block-wise scaling"); + + TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 || + out.scalar_type() == ScalarType::Half, + "Block-wise scaling only supports BFloat16 or Half output types"); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later"); +#endif + + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); +} + +Tensor& +_scaled_nvfp4_nvfp4( + const Tensor& mat_a, const Tensor& mat_b, + const Tensor& scale_a, const SwizzleType swizzle_a, + const Tensor& scale_b, const SwizzleType swizzle_b, + const std::optional& bias, + const c10::ScalarType out_dtype, + Tensor& out, + const std::optional& global_scale_a = std::nullopt, + const std::optional& global_scale_b = std::nullopt) { +#ifdef USE_ROCM + TORCH_CHECK_NOT_IMPLEMENTED(false, "NVFP4 scaling not supported on ROCM"); +#endif + std::optional alpha = std::nullopt; + // Note: "Or" here means that if only one scale is passed, we check for the other. Otherwise, + // if this is "And" we would silently do nothing in the case where one global scale is + // passed and not the other. + if (global_scale_a.has_value() || global_scale_b.has_value()) { + TORCH_CHECK_VALUE(global_scale_a.has_value(), + "For two-level-scaled NVFP4, global_scale_a must have a value"); + TORCH_CHECK_VALUE(global_scale_b.has_value(), + "For two-level-scaled NVFP4, global_scale_b must have a value"); + alpha = global_scale_a.value().mul(global_scale_b.value()); + } + // Restrictions: + // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 + // Scales must be swizzled + TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", + mat_a.scalar_type(), mat_b.scalar_type()); + // Note: fp4x2 format, need to double the K dimension for checking purposes. 
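A quick illustrative sketch of that doubling (plain Python, shapes assumed, not code from this patch), mirroring the round_up/ceil_div arithmetic in the checks that follow:

def ceil_div(a, b): return -(-a // b)
def round_up(a, b): return ceil_div(a, b) * b

M, K_packed, N = 128, 64, 256     # K_packed counts fp4x2 pairs, so the logical K is 128
scale_a_elems = round_up(M, 128) * round_up(ceil_div(K_packed * 2, 16), 4)
scale_b_elems = round_up(N, 128) * round_up(ceil_div(K_packed * 2, 16), 4)
print(scale_a_elems, scale_b_elems)   # 1024 and 2048 for these shapes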
+ auto scale_a_elems = round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1) * 2, 16), 4); + auto scale_b_elems = round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0) * 2, 16), 4); + TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(), + "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel()); + TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(), + "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel()); + + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format"); + TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format"); + + TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(), + "For Blockwise scaling both scales should be contiguous"); + + auto scaling_choice_a = ScalingType::BlockWise1x16; + auto scaling_choice_b = ScalingType::BlockWise1x16; + return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out, alpha); +} + + +// V2: Computes matrix multiply + bias while applying scaling to input and output matrices +// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default. +// If output matrix type is 16 or 32-bit type, scale_result is not applied. +// Known limitations: +// - Only works if mat1 is row-major and mat2 is column-major +// - Only works if matrices sizes are divisible by 32 +// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0) +// and scale_b should have size = to mat2.size(1) +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `scale_a`: a tensor with the inverse scale of `mat1`, whose shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_a`: An integer corresponding to an enum describing the scaling scheme used for `scale_a` +// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_a` +// - `scale_b`: a tensor with the inverse scale of `mat2`, whose shape/strides/dtype depend on the scaling scheme +// - `scale_recipe_b`: An integer corresponding to an enum describing the scaling scheme used for `scale_b` +// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing the swizzling scheme for `scale_b` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `use_fast_accum`: if true, enables fast float8 accumulation. Backends may ignore this option if not applicable. 
+// - `out`: a reference to the output tensor +Tensor& +_scaled_mm_cuda_v2_out( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum, + Tensor& out) { + // Check sizes + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK_NOT_IMPLEMENTED(allowed_device, + "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); + TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix"); + TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix"); + + // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm kernels + // do not support this case). + if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) { + // `out` was created with `at::empty`. In the case where we are multiplying + // MxK by KxN and K is the zero dim, we need to initialize here to properly + // return a tensor of zeros. + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + if (mat_a.size(1) == 0) { + out.zero_(); + } + + return out; + } + + // Check if the input matrix sizes can be multiplied + // - if optional contraction dims are provided, use those + // -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not available. + if (contraction_dim.size() > 0) { + TORCH_CHECK_VALUE(contraction_dim.size() == 2, "contraction_dim must have exactly 2 elements"); + auto mat_a_dim = contraction_dim[0]; + auto mat_b_dim = contraction_dim[1]; + TORCH_CHECK_VALUE( + mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim), "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ") ", + "with contraction dims mat_a: ", mat_a_dim, ", mat_b: ", mat_b_dim); + } else { + TORCH_CHECK_VALUE( + mat_a.size(1) == mat_b.size(0), "mat_a and mat_b shapes cannot be multiplied (", + mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")"); + } + + TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1], + " but got ", bias->numel()); + TORCH_CHECK_VALUE( + mat_a.sizes()[1] % 16 == 0, + "Expected trailing dimension of mat1 to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes()[0], + "x", + mat_a.sizes()[1], + ")."); + TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x", + mat_b.sizes()[1], ") must be divisible by 16"); + + // TODO(slayton): Existing checks, not sure if they should really be here. 
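The contraction-dimension handling above reduces to a simple rule, sketched here in plain Python (illustrative only, not code from this patch):

def contraction_ok(shape_a, shape_b, contraction_dim=None):
    # With an explicit (dim_a, dim_b) pair, compare those sizes; otherwise
    # require the usual inner dimensions of a 2D matmul to agree.
    if contraction_dim:
        dim_a, dim_b = contraction_dim
        return shape_a[dim_a] == shape_b[dim_b]
    return shape_a[1] == shape_b[0]

assert contraction_ok((16, 32), (32, 64))                            # plain K-vs-K match
assert contraction_ok((16, 32), (64, 32), contraction_dim=(-1, -1))  # cheap-transpose-free case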
+ TORCH_CHECK_VALUE(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type"); + TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) || mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2, + "Expected mat_a to be Float8 or Float4_x2 matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(isFloat8Type(mat_b.scalar_type()) || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2, + "Expected mat_b to be Float8 or Float4_x2 matrix got ", mat_b.scalar_type()); +#ifndef USE_ROCM + // Type restrictions imposed by CuBLASLt as of CUDA-12.1 + TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float8_e5m2 || mat_b.scalar_type() != ScalarType::Float8_e5m2, + "Multiplication of two Float8_e5m2 matrices is not supported"); +#endif + if (use_fast_accum) { + TORCH_CHECK_VALUE(mat_a.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat_b.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat_a` or `mat_b` tensors have the `Float4_e2m1fn_x2` dtype."); + } +#ifdef USE_ROCM + if (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat_b.scalar_type() == ScalarType::Float4_e2m1fn_x2) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 70000, + "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above"); + } + if (mat_a.scalar_type() == ScalarType::Float8_e5m2 || mat_b.scalar_type() == ScalarType::Float8_e5m2) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500, + "Float8_e5m2 is only supported for ROCm 6.5 and above"); + } + if (mat_a.scalar_type() == ScalarType::Float8_e4m3fn || mat_b.scalar_type() == ScalarType::Float8_e4m3fn) { + TORCH_CHECK_NOT_IMPLEMENTED(ROCM_VERSION >= 60500, + "Float8_e4m3fn is only supported for ROCm 6.5 and above"); + } +#endif + if (bias) { + TORCH_CHECK_VALUE(out.scalar_type() != kFloat, + "Bias is not supported when out_dtype is set to Float32"); + + TORCH_CHECK_VALUE(bias->scalar_type() == ScalarType::BFloat16 || + bias->scalar_type() == ScalarType::Half, + "Bias must be BFloat16 or Half, but got ", bias->scalar_type()); + + TORCH_CHECK_VALUE((out.scalar_type() != kFloat && + out.scalar_type() != ScalarType::BFloat16) || + bias->scalar_type() == ScalarType::BFloat16, + "Bias must be BFloat16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + + TORCH_CHECK_VALUE(out.scalar_type() != ScalarType::Half || + bias->scalar_type() == ScalarType::Half, + "Bias must be Float16 to compute ", out.scalar_type(), + " output, but got ", bias->scalar_type()); + } + { + auto bias_ = bias.value_or(Tensor()); + + // NOLINTNEXTLINE(*c-array*) + TensorArg targs[]{{out, "out", 0}, {mat_a, "mat_a", 1}, {mat_b, "mat_b", 2}, + {bias_, "bias", 3}, {scale_a[0], "scale_a", 4}, {scale_b[0], "scale_b", 5}}; + checkAllSameGPU(__func__, targs); + } + + auto out_dtype_ = out_dtype.value_or(at::ScalarType::BFloat16); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum(swizzle_b); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed. + // Do this via a list of defined (name, acceptance, concrete_impl) tuples. 
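Conceptually, the dispatch below behaves like this small Python sketch (illustrative only, not code from this patch): walk an ordered table of (name, acceptance function, implementation) entries and take the first entry whose acceptance function returns true.

RECIPES = [
    ("tensorwise_tensorwise", lambda sa, sb: sa == "scalar" and sb == "scalar", "TENSORWISE_TENSORWISE"),
    ("rowwise_rowwise", lambda sa, sb: sa == "per_row" and sb == "per_col", "ROWWISE_ROWWISE"),
]

def pick_impl(scale_kind_a, scale_kind_b):
    for name, accepts, impl in RECIPES:
        if accepts(scale_kind_a, scale_kind_b):
            return impl
    raise ValueError("invalid scaling configuration")

assert pick_impl("per_row", "per_col") == "ROWWISE_ROWWISE"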
+ bool found_impl = false; + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + + for (const auto& fn_entry : scale_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn(mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + found_impl = true; + break; + } + } + TORCH_CHECK_VALUE( + found_impl, + "Invalid scaling configuration.\n" + "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n" + "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", 1) and scale_b should be (1, ", mat_b.size(1), "), and both should be contiguous.\n" + "- For BlockWise 1x128 scaling, a and b should be float8, scales should be float, scale_a should be (", mat_a.size(0), ", ", ceil_div(mat_a.size(1), 128), ") and scale_b should be (", ceil_div(mat_b.size(0), 128), ", ", mat_b.size(1), "), and both should be outer-dim-major.\n" + "- For BlockWise 128x128 scaling, a and b should be float8, scales should be float, scale_a should be (", ceil_div(mat_a.size(0), 128), ", ", ceil_div(mat_a.size(1), 128), ") and scale_b should be (", ceil_div(mat_b.size(0), 128), ", ", ceil_div(mat_b.size(1), 128), "), and both should be near-inner-dim-major (with 16-byte aligned strides).\n" + "- For Blockwise 1x32 scaling, a and b should be float8, scales should be float8_e8m0fnu, scale_a should have ", round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1), 32), 4), " elements and scale_b should have ", round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0), 32), 4), " elements, and both should be contiguous.\n" + "- For Blockwise 1x16 scaling, a and b should be float4 (packed 2x), scales should be float8_e4m3fn, scale_a should have ", round_up(mat_a.size(0), 128) * round_up(ceil_div(mat_a.size(1) * 2, 16), 4), " elements and scale_b should have ", round_up(mat_b.size(1), 128) * round_up(ceil_div(mat_b.size(0) * 2, 16), 4), " elements, and both should be contiguous.\n" + "Got mat_a.dtype()=", mat_a.scalar_type(), ", scale_a[0].dtype()=", scale_a[0].scalar_type(), ", scale_a[0].size()=", scale_a[0].sizes(), ", scale_a[0].stride()=", scale_a[0].strides(), ", ", + "mat_b.dtype()=", mat_b.scalar_type(), ", scale_b[0].dtype()=", scale_b[0].scalar_type(), ", scale_b[0].size()=", scale_b[0].sizes(), " and scale_b[0].stride()=", scale_b[0].strides() + ); + + at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)}); + + auto bias_ = bias.value_or(Tensor()); + + // dispatch to appropriate lower-level calls for error checking & execution + if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) { + return _scaled_tensorwise_tensorwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) { + return _scaled_rowwise_rowwise(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_128x128_1x128) { + return _scaled_block128x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_128x128) { + return _scaled_block1x128_block128x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::BLOCK_1x128_1x128) { + return 
_scaled_block1x128_block1x128(mat_a, mat_b, scale_a[0], scale_b[0], bias, out_dtype_, use_fast_accum, out); + } else if (gemm_impl == ScaledGemmImplementation::MXFP8_MXFP8) { + return _scaled_mxfp8_mxfp8(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4) { + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out, + scale_a[1], scale_b[1]); + } else if (gemm_impl == ScaledGemmImplementation::NVFP4_NVFP4_SINGLE_SCALE) { + return _scaled_nvfp4_nvfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else if (gemm_impl == ScaledGemmImplementation::MXFP4_MXFP4) { + return _scaled_mxfp4_mxfp4(mat_a, mat_b, scale_a[0], swizzle_a_enum[0], scale_b[0], swizzle_b_enum[0], bias, out_dtype_, out); + } else { + TORCH_CHECK_VALUE(false, "Invalid state - found an implementation, but not really"); + } +} + +Tensor +_scaled_mm_cuda_v2( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_)); + + return _scaled_mm_cuda_v2_out( + mat_a, mat_b, + scale_a, scale_recipe_a, swizzle_a, + scale_b, scale_recipe_b, swizzle_b, + bias, + out_dtype, + contraction_dim, + use_fast_accum, + out); +} + +// 2d-2d and 2d-3d +// scaling=MXFP8 +// CUDA-only +Tensor& +_mx8_mx8_bf16_grouped_mm_fbgemm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const SwizzleType& swizzle_a, + const Tensor& scale_b, + const SwizzleType& swizzle_b, + const std::optional& offs, + Tensor& out) { + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK_VALUE(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK_VALUE(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + TORCH_CHECK_VALUE(out.scalar_type() == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + // MXFP8 expects float8_e8m0fnu scales. 
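For orientation, the 1x32 block-scale bookkeeping used here looks roughly like the following sketch (plain Python, shapes assumed, not code from this patch); the padded count matches the swizzled-layout element checks earlier in this patch, and the e8m0 requirement noted above is enforced by the check that follows.

def ceil_div(a, b): return -(-a // b)
def round_up(a, b): return ceil_div(a, b) * b

M, K = 64, 32
blocks_per_row = ceil_div(K, 32)                                   # one e8m0 scale per 32 K-elements
unswizzled_elems = M * blocks_per_row                              # plain row-major scale layout
swizzled_elems = round_up(M, 128) * round_up(blocks_per_row, 4)    # SWIZZLE_32_4_4 pads rows to 128, block-columns to 4
print(unswizzled_elems, swizzled_elems)                            # 64 vs 512 for this small shape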
+ TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu, + "For MXFP8 grouped gemm, both scales must be float8_e8m0fnu tensors."); +#ifdef USE_ROCM + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE && swizzle_b == SwizzleType::NO_SWIZZLE, + "For ROCM MXFP8 grouped gemm, both scale swizzle types must be SWIZZLE_NONE"); +#else + TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4 && swizzle_b == SwizzleType::SWIZZLE_32_4_4, + "For CUDA MXFP8 grouped gemm, both scale swizzle types must be SWIZZLE_32_4_4"); +#endif + +#if defined(USE_FBGEMM_GENAI) and !defined(USE_ROCM) + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "mxfp8_mxfp8 grouped gemm requires compile with USE_FBGEMM_GENAI"); +#endif + return out; +} + +// 2d-2d and 2d-3d cases +// scaling=rowwise +// CUDA-only +Tensor& +_f8_f8_bf16_rowwise_grouped_mm_cuda( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + const bool use_fast_accum, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_b.scalar_type()); + + at::cuda::detail::f8f8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); + return out; +} + +// 2d-2d and 2d-3d cases +// scaling=rowwise +// only being called for rocm +Tensor& +_f8_f8_bf16_rowwise_grouped_mm_rocm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + Tensor& out) { + TORCH_CHECK_VALUE(mat_a.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_a.scalar_type()); + TORCH_CHECK_VALUE(mat_b.dtype() == at::kFloat8_e4m3fnuz, "Expected mat_a to be Float8_e4m3fnuz matrix got ", mat_b.scalar_type()); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_ROCM) + fbgemm_gpu::f8f8bf16_rowwise_grouped_mm( + mat_a, + // FBGEMM expects B matrix shape to be (.., N, K) + mat_b.transpose(-2, -1), + scale_a, + scale_b, + offs, + out); +#else + TORCH_CHECK_NOT_IMPLEMENTED(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") +#endif + return out; + +} + +// Dispatch f8 x f8 -> bf16 row-wise scaled to rocm/cuda +Tensor& +_f8_f8_bf16_rowwise_grouped_mm( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + bool use_fast_accum, + Tensor& out) { + // FP8 per-tensor and per-row scaling expect fp32 scales. 
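One detail from the ROCm branch above worth a sketch: FBGEMM wants B laid out as (..., N, K), so the caller's (..., K, N) tensor is handed over with its last two dims swapped (plain Python, illustrative only, not code from this patch). The float32 scale requirement noted just above is then enforced by the check that follows.

def to_fbgemm_b_shape(shape_kn):
    # Swap the last two dims: (..., K, N) -> (..., N, K)
    return shape_kn[:-2] + (shape_kn[-1], shape_kn[-2])

assert to_fbgemm_b_shape((8, 128, 64)) == (8, 64, 128)   # grouped 3D case
assert to_fbgemm_b_shape((128, 64)) == (64, 128)         # 2D case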
+ TORCH_CHECK_VALUE(scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, + "For grouped FP8 rowwise, both scales must be float32 tensors"); +#ifndef USE_ROCM + return _f8_f8_bf16_rowwise_grouped_mm_cuda( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); +#else + // NOTE: ignore use_fast_accum + TORCH_CHECK_VALUE(!bias.has_value(), "ROCM grouped gemm does not support bias") + return _f8_f8_bf16_rowwise_grouped_mm_rocm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + out); +#endif +} + +Tensor +_scaled_grouped_mm_cuda( + const Tensor& mat_a, + const Tensor& mat_b, + const Tensor& scale_a, + const Tensor& scale_b, + const std::optional& offs, + const std::optional& bias, + const std::optional& scale_result, + std::optional out_dtype, + bool use_fast_accum) { + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK_VALUE(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); + + TORCH_CHECK_VALUE(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK_VALUE(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK_VALUE(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK_VALUE(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // NOTE(slayton): For sub-1B formats want contraction_dim argument? + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + TORCH_CHECK_VALUE( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK_VALUE(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + + TORCH_CHECK_VALUE(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK_VALUE(!scale_result.has_value(), "Scale result not supported yet"); + TORCH_CHECK_VALUE(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + // NOTE: mxfp8 x mxfp8 requires (and asserts later) that offsets is present. + // for rowwise, no offsets implies 3d-3d and is handled by lower-level + // routines + if (offs.has_value()) { + TORCH_CHECK_VALUE(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK_VALUE(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. + TORCH_CHECK_VALUE( + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); + + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; + check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); + check_scale(mat_b, scale_b, 1, 1, scale_multiplier); + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK_VALUE(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); +#else + bool is_mx8mx8bf16 = false; +#endif + + if (is_mx8mx8bf16) { + // Note: Passing implied SwizzleType here, correctness of scale previously checked + // in `check_scale` call + return _mx8_mx8_bf16_grouped_mm_fbgemm( + mat_a, + mat_b, + scale_a, + SwizzleType::SWIZZLE_32_4_4, + scale_b, + SwizzleType::SWIZZLE_32_4_4, + offs.value(), + out); + } + + // If we're not MXFP8, then we're row-wise scaling. + return _f8_f8_bf16_rowwise_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs, + bias, + use_fast_accum, + out); +} + +namespace { + +std::array, 2> scale_grouped_kernel_dispatch = {{ + { "rowwise_rowwise", check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE}, + { "mxfp8_mxfp8", check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}}; + +} // anonymous namespace + +Tensor +_scaled_grouped_mm_cuda_v2( + const Tensor& mat_a, const Tensor& mat_b, + ArrayRef scale_a, + IntArrayRef scale_recipe_a, + IntArrayRef swizzle_a, + ArrayRef scale_b, + IntArrayRef scale_recipe_b, + IntArrayRef swizzle_b, + const std::optional& offs, + const std::optional& bias, + const std::optional out_dtype, + IntArrayRef contraction_dim, + bool use_fast_accum) { + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK_VALUE(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); + + TORCH_CHECK_VALUE(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); + TORCH_CHECK_VALUE(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); + TORCH_CHECK_VALUE(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK_VALUE(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + + // NOTE(slayton): For sub-1B formats want contraction_dim argument? 
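As in the v1 grouped entry point above, offs is a 1D int32 tensor with one entry per group, and in the 2d-2d row-wise case the expected scale size is multiplied by the number of groups (the scale_multiplier above). A rough sketch with assumed values (plain Python, not code from this patch):

offs_len = 3                              # offs.size(0): one int32 entry per group
M = 16                                    # rows of the 2D mat_a
scale_multiplier = offs_len               # 2d-2d case; otherwise 1
expected_rowwise_scale_a_elems = M * scale_multiplier
print(expected_rowwise_scale_a_elems)     # 48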
+ if (!a_is_2d || !b_is_2d) { + if (contraction_dim.size() > 0) { + const int dim_a = contraction_dim[0], dim_b = contraction_dim[1]; + TORCH_CHECK_VALUE(mat_a.size(dim_a) == mat_b.size(dim_b), + "Contraction dimensions (", dim_a, ",", dim_b, ") of mat_a and mat_b must match, got: ", mat_a.size(dim_a), " and ", + mat_b.size(dim_b)); + // Note: only (-1, -2) is currently supported + TORCH_CHECK_VALUE(dim_a == -1 && dim_b == -2, "Currently contraction dims must be (-1, -2) only"); + } else { + TORCH_CHECK_VALUE(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + } + TORCH_CHECK_VALUE( + mat_a.size(-1) % 16 == 0, + "Expected trailing dimension of mat_a to be divisible by 16 ", + "but got mat1 shape: (", + mat_a.sizes(), + ")."); + TORCH_CHECK_VALUE(mat_b.size(-2) % 16 == 0 && mat_b.size(-1) % 16 == 0, + "Expected mat_b shape to be divisible by 16 ", + "but got mat_b shape: (", + mat_b.sizes(), + ")."); + + TORCH_CHECK_VALUE(!bias.has_value(), "Bias not supported yet"); + TORCH_CHECK_VALUE(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix"); + + // NOTE: mxfp8 x mxfp8 requires (and asserts later) that offsets is present. + // for rowwise, no offsets implies 3d-3d and is handled by lower-level + // routines + if (offs.has_value()) { + TORCH_CHECK_VALUE(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK_VALUE(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK_VALUE(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + + // Conversion of implicitly-defined enums to explicit + auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a); + auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a); + auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b); + auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b); + + // at this point we can start working out what we want to be doing + // Try to do as few steps as possible. + // NOTE: support is deliberately sparse, can explicitly enumerate all combinations allowed. + // Do this via a list of defined (name, acceptance, concrete_impl) tuples. + ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE; + for (const auto& fn_entry : scale_grouped_kernel_dispatch) { + const auto [name, accept_fn, scaled_gemm_impl] = fn_entry; + bool ok = accept_fn(mat_a.scalar_type(), + scale_recipe_a_enum, + scale_a, + mat_b.scalar_type(), + scale_recipe_b_enum, + scale_b); + if (ok) { + gemm_impl = scaled_gemm_impl; + break; + } + } + TORCH_CHECK_VALUE(gemm_impl != ScaledGemmImplementation::NONE, + "No gemm implementation was found"); + + switch (gemm_impl) { + case ScaledGemmImplementation::ROWWISE_ROWWISE: { + const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ?
offs->size(0) : 1; + _check_scales_fp8_rowwise(mat_a, scale_a[0], 0 /* dim */ , 0 /* arg_idx */, scale_multiplier); + _check_scales_fp8_rowwise(mat_b, scale_b[0], 1 /* dim */ , 1 /* arg_idx */, scale_multiplier); + return _f8_f8_bf16_rowwise_grouped_mm( + mat_a, + mat_b, + scale_a[0], + scale_b[0], + offs, + bias, + use_fast_accum, + out); + } + case ScaledGemmImplementation::MXFP8_MXFP8: { + _check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */); + _check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */); + return _mx8_mx8_bf16_grouped_mm_fbgemm( + mat_a, + mat_b, + scale_a[0], + swizzle_a_enum[0], + scale_b[0], + swizzle_b_enum[0], + offs.value(), + out); + } + default: + TORCH_CHECK_NOT_IMPLEMENTED(false, + "_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here"); + } +} + +Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); +#ifndef USE_ROCM + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; +#else + // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used. + // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm + bool use_fast_path = false; +#endif + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + } + return out; +} + +static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { + // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm + TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); + + const auto batch1_sizes = batch1.sizes(); + const auto batch2_sizes = batch2.sizes(); + + int64_t bs = batch1_sizes[0]; + int64_t contraction_size = batch1_sizes[2]; + int64_t res_rows = batch1_sizes[1]; + int64_t res_cols = batch2_sizes[2]; + std::vector output_size {bs, res_rows, res_cols}; + + TORCH_CHECK(batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size, + "Expected size for first two dimensions of batch2 tensor to be: [", + bs, ", ", contraction_size, "] but got: [", batch2_sizes[0], ", ", batch2_sizes[1], "]."); + + TORCH_CHECK(batch1.scalar_type() == batch2.scalar_type(), "batch1 and batch2 must have the same dtype"); + + TORCH_CHECK(out_dtype == batch1.scalar_type() || + (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); + if (!is_bmm && self_baddbmm.has_value()) { + const auto& self = self_baddbmm.value(); + TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); + TORCH_CHECK(self.sizes() == output_size, "self must have the 
same shape as the output"); + } +} + +Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { + IntArrayRef batch1_sizes = batch1.sizes(); + IntArrayRef batch2_sizes = batch2.sizes(); + + Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); + return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); +} + +Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { + baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); + Scalar beta(0.0); + Scalar alpha(1.0); + { + NoNamesGuard guard; + baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); + } + + return out; +} + +Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { + // We need to copy the tensor + Tensor out = self.clone().to(self.options().dtype(out_dtype)); + + return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); +} + +Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { + baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); { NoNamesGuard guard; baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -1853,6 +3082,12 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy } Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) { + TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same"); TORCH_CHECK(out_dtype == self.scalar_type() || @@ -1861,7 +3096,7 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); - addmm_out_cuda_impl(const_cast(out), out, self, mat2, 0, 1); + addmm_out_cuda_impl(out, out, self, mat2, 0, 1); return out; } @@ -1872,6 +3107,14 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { + TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", + 
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); TORCH_CHECK(out_dtype == self.scalar_type() || (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 12ad84a15b18..ee28c5c1693f 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { dtypes[i] = iter.dtype(i); } auto offset_calc = ::make_offset_calculator(iter); +#ifdef USE_ROCM + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); + arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); + arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); + arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); c10::cast_and_store(dtypes[0], out, result); }); +#endif } } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 59b0426bab1f..62a07e1e28c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) { }); } +#ifdef USE_ROCM +void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) { + return static_cast(value); + }); +} +void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) { + return static_cast(value); + }); +} +#endif + void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType dtype = iter.dtype(0); ScalarType other_dtype = iter.dtype(1); @@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { } else { float16_copy_kernel_cuda(iter); } - } else if (isBitsType(dtype)) { + } +#ifdef USE_ROCM + else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) { + if (iter.dtype(1) == kBFloat16) { + bfloat16tofloat32_copy_kernel_cuda(iter); + } else { + float16tofloat32_copy_kernel_cuda(iter); + } + } +#endif + else if (isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits 
types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index 1ed6a7722d9b..344906a2a4df 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -38,12 +38,41 @@ __device__ inline int min(int a, int b) { #define BLOCK_STRIDE_BWD 2 // increasing block_stride to lower # of blocks launched #endif -static __device__ inline int p_start(int size, int pad, int kernel, int dilation, int stride) { - return (size + pad < ((kernel - 1) * dilation + 1)) ? 0 : (size + pad - ((kernel - 1) * dilation + 1)) / stride + 1; +template +static __device__ inline index_t p_start(index_t size, int pad, int kernel, int dilation, int stride) { + const auto kernel_extent = static_cast((kernel - 1) * dilation + 1); + return (size + pad < kernel_extent) ? index_t(0) : (size + pad - kernel_extent) / stride + 1; } -static __device__ inline int p_end(int size, int pad, int pooled_size, int stride) { - return min((size + pad) / stride + 1, pooled_size); +template +static __device__ inline index_t p_end(index_t size, int pad, index_t pooled_size, int stride) { + return std::min((size + pad) / stride + 1, pooled_size); +} + +static inline bool can_use_int32_nhwc( + int64_t nbatch, int64_t channels, + int64_t height, int64_t width, + int64_t pooled_height, int64_t pooled_width, + int64_t in_stride_n, int64_t in_stride_c, + int64_t in_stride_h, int64_t in_stride_w) +{ + constexpr int64_t int_max = std::numeric_limits::max(); + + int64_t max_intra_batch = + (height ? (height - 1) * in_stride_h : 0) + + (width ? (width - 1) * in_stride_w : 0) + + (channels? (channels - 1) * in_stride_c : 0); + + int64_t max_input_offset = (nbatch ? (nbatch - 1) * in_stride_n : 0) + max_intra_batch; + + if (max_input_offset > int_max) return false; + + int64_t out_batch_stride = pooled_height * pooled_width * channels; + if ((nbatch ? 
(nbatch - 1) * out_batch_stride : 0) > int_max) return false; + + if (height * width > int_max) return false; + + return true; } // kernels borrowed from Caffe @@ -85,21 +114,25 @@ __global__ void max_pool_forward_nchw(const int nthreads, const scalar_t* bottom } } -template +template C10_LAUNCH_BOUNDS_1(CUDA_MAX_THREADS) -__global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nbatch, - const int64_t channels, const int64_t height, - const int64_t width, const int pooled_height, const int pooled_width, - const int kernel_h, const int kernel_w, const int stride_h, - const int stride_w, const int pad_h, const int pad_w, - const int dilation_h, const int dilation_w, - const int in_stride_n, const int in_stride_c, - const int in_stride_h, const int in_stride_w, - const int kernel_stride_C, const int kernel_size_C, - scalar_t* top_data, int64_t* top_mask) { - extern __shared__ int smem[]; - int *out_mask_cached = smem; - scalar_t *out_cached = reinterpret_cast(&out_mask_cached[kernel_size_C*blockDim.x*blockDim.y*blockDim.z]); +__global__ void max_pool_forward_nhwc( + const scalar_t* bottom_data, + const int nbatch, + const index_t channels, const index_t height, const index_t width, + const index_t pooled_height, const index_t pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, + const index_t in_stride_n, const index_t in_stride_c, + const index_t in_stride_h, const index_t in_stride_w, + const int kernel_stride_C, const int kernel_size_C, + scalar_t* top_data, int64_t* top_mask) { + + extern __shared__ unsigned char smem_raw[]; + index_t *out_mask_cached = reinterpret_cast(smem_raw); + scalar_t *out_cached = reinterpret_cast( + out_mask_cached + kernel_size_C*blockDim.x*blockDim.y*blockDim.z); // flattening cta for pre-computation & smem initialization; int thread_id = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); @@ -118,26 +151,26 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba int channel_id = blockIdx.x / nbatch; int channel_offset = threadIdx.x + channel_id * blockDim.x; - top_data = top_data + batch_id * pooled_height * pooled_width * channels; - top_mask = top_mask + batch_id * pooled_height * pooled_width * channels; - bottom_data = bottom_data + batch_id * in_stride_n; + top_data = top_data + static_cast(batch_id) * (pooled_height * pooled_width * channels); + top_mask = top_mask + static_cast(batch_id) * (pooled_height * pooled_width * channels); + bottom_data = bottom_data + static_cast(batch_id) * in_stride_n; - out_cached = &out_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; - out_mask_cached = &out_mask_cached[(threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x]; + out_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x; + out_mask_cached += (threadIdx.z * blockDim.y + threadIdx.y) * kernel_size_C*blockDim.x; - int oH = (pooled_height + gridDim.z-1) / gridDim.z; - int oW = (pooled_width + gridDim.y-1) / gridDim.y; + int oH = (static_cast(pooled_height) + gridDim.z - 1) / gridDim.z; + int oW = (static_cast(pooled_width) + gridDim.y - 1) / gridDim.y; int ostartH = threadIdx.z + blockIdx.z*oH; - int oendH = ::min(ostartH+oH, pooled_height); + int oendH = ::min(ostartH+oH, static_cast(pooled_height)); int ostartW = threadIdx.y + blockIdx.y*oW; - int oendW = ::min(ostartW+oW, pooled_width); + int oendW = 
::min(ostartW+oW, static_cast(pooled_width)); for (int oh = ostartH; oh < oendH; oh+=blockDim.z) { - int hstart = oh * stride_h - pad_h; - int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); + index_t hstart = static_cast(oh) * stride_h - pad_h; + index_t hend = std::min(hstart + static_cast((kernel_h - 1) * dilation_h + 1), height); for (int ow = ostartW; ow < oendW; ow+=blockDim.y) { - int wstart = ow * stride_w - pad_w; - int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); + index_t wstart = static_cast(ow) * stride_w - pad_w; + index_t wend = std::min(wstart + static_cast((kernel_w - 1) * dilation_w + 1), width); while(hstart < 0) hstart += dilation_h; while(wstart < 0) @@ -185,12 +218,12 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba // Else do it Non-Prefetch... else #endif - for (int ih = hstart; ih < hend; ih += dilation_h) { - for (int iw = wstart; iw < wend; iw += dilation_w) { + for (index_t ih = hstart; ih < hend; ih += dilation_h) { + for (index_t iw = wstart; iw < wend; iw += dilation_w) { int cached_index = threadIdx.x; const scalar_t *ptr_input = bottom_data + ih * in_stride_h + iw * in_stride_w; - for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { - scalar_t val = ptr_input[c*in_stride_c]; + for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { + scalar_t val = ptr_input[c * in_stride_c]; if ((val > out_cached[cached_index]) || at::_isnan(val)) { out_cached[cached_index] = val; out_mask_cached[cached_index] = ih * width + iw; @@ -200,15 +233,15 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba } } - scalar_t *ptr_output_data = top_data + (oh * pooled_width + ow) * channels; - int64_t *ptr_output_mask = top_mask + (oh * pooled_width + ow) * channels; + scalar_t *ptr_output_data = top_data + (static_cast(oh) * pooled_width + ow) * channels; + int64_t *ptr_output_mask = top_mask + (static_cast(oh) * pooled_width + ow) * channels; int cached_index = threadIdx.x; - for(int c = channel_offset; c < channels; c+= blockDim.x*kernel_stride_C) { + for (index_t c = channel_offset; c < channels; c += static_cast(blockDim.x) * kernel_stride_C) { ptr_output_data[c] = out_cached[cached_index]; - ptr_output_mask[c] = out_mask_cached[cached_index]; + ptr_output_mask[c] = static_cast(out_mask_cached[cached_index]); out_cached[cached_index] = at::numeric_limits::lower_bound(); - out_mask_cached[cached_index] = 0; + out_mask_cached[cached_index] = index_t(0); cached_index += blockDim.x; } } @@ -216,7 +249,7 @@ __global__ void max_pool_forward_nhwc(const scalar_t* bottom_data, const int nba } -static const int BLOCK_THREADS = 256; +static constexpr int BLOCK_THREADS = 256; template #if defined (USE_ROCM) @@ -462,6 +495,11 @@ const Tensor& indices) { maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); const dim3 block(block_x, block_y, block_z); + bool use_int32 = can_use_int32_nhwc( + nbatch, nInputPlane, inputHeight, inputWidth, + outputHeight, outputWidth, + in_stride_n, in_stride_c, in_stride_h, in_stride_w); + int kernel_stride_C = ceil_div( safe_downcast(nInputPlane), block_x * 4); int kernel_size_C = ceil_div( @@ -476,18 +514,41 @@ const Tensor& indices) { ceil_div(safe_downcast(outputHeight), block_z*BLOCK_STRIDE_FWD)); const dim3 grid(grid_x, grid_y, grid_z); - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= 
-      AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock);
-
-      max_pool_forward_nhwc<scalar_t>
-      <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
-          input_data, nbatch,
-              nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
-              kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-              in_stride_n, in_stride_c,
-              in_stride_h, in_stride_w,
-              kernel_stride_C, kernel_size_C,
-              output_data, indices_data);
+      size_t shmem_size;
+      size_t mask_elems = static_cast<size_t>(kernel_size_C) * block_x * block_y * block_z;
+
+      if (use_int32) {
+        shmem_size = mask_elems * (sizeof(int32_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int32_t>
+            <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            static_cast<int32_t>(nInputPlane),
+            static_cast<int32_t>(inputHeight),
+            static_cast<int32_t>(inputWidth),
+            static_cast<int32_t>(outputHeight),
+            static_cast<int32_t>(outputWidth),
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            static_cast<int32_t>(in_stride_n),
+            static_cast<int32_t>(in_stride_c),
+            static_cast<int32_t>(in_stride_h),
+            static_cast<int32_t>(in_stride_w),
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      } else {
+        shmem_size = mask_elems * (sizeof(int64_t) + sizeof(scalar_t));
+        TORCH_CHECK(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock,
+                    "shared memory too small");
+        max_pool_forward_nhwc<scalar_t, int64_t>
+            <<<grid, block, shmem_size, at::cuda::getCurrentCUDAStream()>>>(
+            input_data, static_cast<int>(nbatch),
+            nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+            kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+            in_stride_n, in_stride_c, in_stride_h, in_stride_w,
+            kernel_stride_C, kernel_size_C,
+            output_data, indices_data);
+      }
       C10_CUDA_KERNEL_LAUNCH_CHECK();
       break;
     }
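
Note on the MaxPool2d.cu hunks above: the NHWC forward kernel is now templated on an
index_t, and the launcher selects the int32_t instantiation when can_use_int32_nhwc()
confirms that every offset the kernel computes fits in a signed 32-bit integer, falling
back to int64_t otherwise. The standalone C++ sketch below only illustrates the shape of
such a guard; the helper name fits_in_int32 and the particular offset terms are
assumptions for illustration, not the exact checks in can_use_int32_nhwc.

#include <cstdint>
#include <limits>

// Illustrative only: return true when the largest linear offsets an NHWC
// max-pool kernel would form stay within int32 range, so the narrower
// (and cheaper) index type can be used safely.
static bool fits_in_int32(int64_t nbatch, int64_t channels,
                          int64_t height, int64_t width,
                          int64_t out_height, int64_t out_width,
                          int64_t in_stride_n) {
  constexpr int64_t int_max = std::numeric_limits<int32_t>::max();
  // Largest input offset: start of the last batch plus one full image.
  if ((nbatch > 1 ? (nbatch - 1) * in_stride_n : 0) + channels * height * width > int_max)
    return false;
  // Largest output offset: one past the last element of the output tensor.
  if (nbatch * out_height * out_width * channels > int_max)
    return false;
  // The mask caches a flattened h * w position per element.
  if (height * width > int_max)
    return false;
  return true;
}

Shared memory is sized per branch because each cached mask entry is sizeof(int32_t)
versus sizeof(int64_t), which is why shmem_size is recomputed inside both arms of the
if (use_int32) dispatch before the TORCH_CHECK against sharedMemPerBlock.
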
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 602dfd6e5288..65b0e1441de7 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -15,9 +15,7 @@
 #include
 #include
 
-#if CUB_SUPPORTS_SCAN_BY_KEY()
 #include
-#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include
@@ -36,9 +34,9 @@ namespace at::native {
 namespace {
 
 #if defined(USE_ROCM)
-static const int BLOCKDIMY = 16;
+static constexpr int BLOCKDIMY = 16;
 #else
-static const int BLOCKDIMY = 32;
+static constexpr int BLOCKDIMY = 32;
 #endif
 
 template
@@ -240,10 +238,6 @@ __global__ void renorm_kernel(
 
 } // anonymous namespace
 
-#if !CUB_SUPPORTS_SCAN_BY_KEY()
-template <typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
-#endif
 
 Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indices_,
                                      int64_t num_weights, int64_t padding_idx,
@@ -306,7 +300,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
 
   if (scale_grad_by_freq) {
     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-#if CUB_SUPPORTS_SCAN_BY_KEY()
     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -333,11 +326,6 @@ Tensor embedding_dense_backward_cuda(const Tensor & grad_, const Tensor & indice
         num_indices
       );
     });
-#else
-    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cuda", [&] () {
-      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
-    });
-#endif
   }
 
   return embedding_backward_cuda_kernel(grad, orig_indices,
diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
index 76307a0bf549..6ce419137345 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu
@@ -10,9 +10,7 @@
 #include
 
-#if CUB_SUPPORTS_UNIQUE_BY_KEY()
 #include
-#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include
@@ -88,9 +86,9 @@ __global__ void compute_grad_weight_bags(
     const int64_t stride_warped) {
   int64_t num_of_segments = *num_of_segments_ptr;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -134,9 +132,9 @@ __global__ void compute_grad_weight(
   int64_t num_of_segments = *num_of_segments_ptr;
   using accscalar_t = acc_type<scalar_t, true>;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -167,9 +165,9 @@ __global__ void sum_and_scatter(
   int64_t num_of_segments = *num_of_segments_ptr;
   int64_t num_of_partial_segments = *num_of_partial_segments_ptr;
-  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int id = gid / stride_warped;
-  const int startFeature = gid % stride_warped;
+  const int64_t gid = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const int64_t id = gid / stride_warped;
+  const int64_t startFeature = gid % stride_warped;
   if (startFeature >= stride) {
     return;
   }
@@ -196,18 +194,9 @@ __global__ void compute_num_of_partial_segments(const index_t *partials_per_segm
         partials_per_segment_offset[num_of_segments-1];
 }
 
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-__global__ void write_num_of_segments_for_legacy_thrust_path(int64_t *num_of_segments_ptr, int64_t num_of_segments) {
-  *num_of_segments_ptr = num_of_segments;
-}
-#endif
 } // anon namespace
 
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-template <typename index_t>
-int64_t embedding_backward_cuda_kernel_unique_by_key(const Tensor &sorted_indices, Tensor &segment_offsets);
-#endif
 
 Tensor embedding_backward_cuda_kernel(
     const Tensor &grad,
@@ -234,20 +223,12 @@ Tensor embedding_backward_cuda_kernel(
   auto segment_offsets = at::empty({numel}, orig_indices.options());
   auto num_of_segments_tensor = at::empty({}, grad.options().dtype(kLong));
   int64_t *num_of_segments_ptr = num_of_segments_tensor.mutable_data_ptr<int64_t>();
-#if !CUB_SUPPORTS_UNIQUE_BY_KEY()
-  AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
-    int64_t num_of_segments = embedding_backward_cuda_kernel_unique_by_key<index_t>(sorted_indices, segment_offsets);
-    write_num_of_segments_for_legacy_thrust_path<<<1, 1, 0, c10::cuda::getCurrentCUDAStream()>>>(num_of_segments_ptr, num_of_segments);
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-  });
-#else
   AT_DISPATCH_INDEX_TYPES(orig_indices.scalar_type(), "embedding_backward_cuda_kernel", [&] () {
     cuda::cub::unique_by_key(
       sorted_indices.const_data_ptr<index_t>(), thrust::make_counting_iterator(0),
       segment_offsets.mutable_data_ptr<index_t>(),
       num_of_segments_ptr, sorted_indices.numel());
   });
-#endif
 
   int64_t max_segments = std::min(numel, num_weights);
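
Note on the Embedding.cu and EmbeddingBackwardKernel.cu hunks above (the EmbeddingBag.cu
hunk below follows the same pattern): with the CUB_SUPPORTS_SCAN_BY_KEY() /
CUB_SUPPORTS_UNIQUE_BY_KEY() guards removed, the scale_grad_by_freq count pass and the
segment-offset computation always go through the cuda::cub primitives, and the legacy
thrust-based fallbacks (embedding_dense_backward_cuda_scan,
embedding_backward_cuda_kernel_unique_by_key) are deleted. As a reference for what the
count pass produces, here is a CPU-only sketch of the same semantics; the function name
occurrence_counts is made up for illustration and this is not the kernel code.

#include <cstddef>
#include <cstdint>
#include <vector>

// For every position in a sorted index list, compute how many times that index
// value occurs in total. The CUDA path reaches the same result with scan-by-key
// passes over the sorted indices instead of an explicit loop.
std::vector<int64_t> occurrence_counts(const std::vector<int64_t>& sorted_indices) {
  std::vector<int64_t> count(sorted_indices.size(), 0);
  std::size_t start = 0;
  while (start < sorted_indices.size()) {
    std::size_t end = start;
    while (end < sorted_indices.size() && sorted_indices[end] == sorted_indices[start]) {
      ++end;
    }
    for (std::size_t i = start; i < end; ++i) {
      count[i] = static_cast<int64_t>(end - start);  // run length of this value
    }
    start = end;
  }
  return count;
}
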
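
Note on the ForeachBinaryOp*.cu and ForeachPointwiseOp.cu hunks further below: each one
replaces "return tensor_lists[k];" with "return std::move(tensor_lists[k]);". Copy
elision (NRVO) applies only to a complete local object, not to an element of one, so
returning an element of the local tensor_lists vector by name copies that inner
std::vector<Tensor>; std::move turns the return into a move. The same reasoning applies
to the emplace_back(std::move(vec_res)) change in ForeachBinaryOpScalarList.cu. A
minimal standalone illustration, with the element type simplified to int:

#include <utility>
#include <vector>

// Returning lists[1] by name would copy the inner vector, because NRVO cannot
// apply to an element of a local container. std::move turns it into a move.
std::vector<int> pick_second(std::vector<std::vector<int>> lists) {
  return std::move(lists[1]);
}
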
diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu
index fb92c7488a15..ab3747df031e 100644
--- a/aten/src/ATen/native/cuda/EmbeddingBag.cu
+++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu
@@ -31,16 +31,10 @@
 #include
 
-#if CUB_SUPPORTS_SCAN_BY_KEY()
 #include
-#endif
 
 namespace at::native {
 
-#if !CUB_SUPPORTS_SCAN_BY_KEY()
-template <typename index_t>
-void embedding_dense_backward_cuda_scan(Tensor &sorted_indices, Tensor &count);
-#endif
 
 namespace {
 
@@ -199,7 +193,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
 
   if (scale_grad_by_freq) {
     count = at::empty_like(indices, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-#if CUB_SUPPORTS_SCAN_BY_KEY()
     AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -226,11 +219,6 @@ Tensor embedding_bag_backward_cuda_sum_avg(
         num_indices
      );
    });
-#else
-    AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_backward_cuda_sum_avg", [&] () {
-      embedding_dense_backward_cuda_scan<index_t>(sorted_indices, count);
-    });
-#endif
   }
   return embedding_backward_cuda_kernel(grad, orig_indices, sorted_indices, count, num_weights,
                                         padding_idx, mode == EmbeddingBagMode::MEAN, offset2bag,
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
index 7ee02b02b41f..227d42247ebd 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
       Op(),
       alpha.to<opmath_t>());
 
-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
index 80d748dd3579..9ac0e875b2d6 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
           /* res_arg_index */ 1>(),
       Op(),
       scalar.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
index dcb93188b5e6..b28aa690630b 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
   }
 
   tensor_lists.emplace_back(tensors.vec());
-  tensor_lists.emplace_back(vec_res);
+  tensor_lists.emplace_back(std::move(vec_res));
 
   using opmath_t = at::opmath_type;
   multi_tensor_apply<2, opmath_t>(
@@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
           /* res_arg_index */ 1>(),
       Op());
 
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
index ad5eeee5ebec..bc6bd3789125 100644
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
@@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
       Op(),
       scalar.data_ptr(),
       alpha.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }
 
 template class Op>
diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
index 7a3276c44750..7f563f55d556 100644
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
         scalar.to<opmath_t>());
   });
 
-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }
 
 template